Commit 424a2ce5 authored by zlsh80826

merge upstream, test=develop

......@@ -54,10 +54,9 @@ void HdfsStore::set(const std::string& key, const std::vector<char>& data) {
paddle::framework::fs_remove(tmp);
if (i == retry_times_) {
VLOG(0) << "fs_open_write failed, retry times reaches limit";
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"fs_open_write failed, retry times reaches %d limit.",
retry_times_));
}
} else {
break;
......@@ -143,9 +142,9 @@ void HdfsStore::wait(const std::vector<std::string>& keys,
break;
}
}
VLOG(0) << "TIMEOUT self_rank = " << self_rank_
<< " pair_rank = " << last_check_rank;
PADDLE_THROW(paddle::platform::errors::ExecutionTimeout(
"TIMEOUT self_rank = %d pair_rank = %d", self_rank_,
last_check_rank));
}
std::this_thread::sleep_for(std::chrono::milliseconds(wait_sleep_ms_));
}
......
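Both hunks above share one pattern: retry or poll in a loop, log at VLOG(0), and raise a typed error (PreconditionNotMet, ExecutionTimeout) once the limit is reached, instead of only logging. A minimal Python sketch of the wait-with-timeout loop; check_key and the parameter names are illustrative, not part of the Paddle API:

import time

def wait_for_keys(check_key, keys, timeout_s=60, wait_sleep_ms=100):
    # Poll until every key is visible or the deadline passes,
    # mirroring HdfsStore::wait above.
    deadline = time.time() + timeout_s
    while True:
        missing = [k for k in keys if not check_key(k)]
        if not missing:
            return
        if time.time() > deadline:
            # corresponds to PADDLE_THROW(ExecutionTimeout(...)) above
            raise TimeoutError("TIMEOUT waiting for keys: %s" % missing)
        time.sleep(wait_sleep_ms / 1000.0)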
......@@ -28,16 +28,32 @@ DEFINE_bool(enable_unused_var_check, false,
"Checking whether operator contains unused inputs, "
"especially for grad operator. It should be in unittest.");
namespace paddle {
namespace framework {
std::unordered_set<std::string> *GetThreadLocalUsedVarNameSet() {
thread_local std::unordered_set<std::string> used_var_name_set;
return &used_var_name_set;
}
void LogVarUsageIfUnusedVarCheckEnabled(const std::string &name) {
if (FLAGS_enable_unused_var_check) {
VLOG(6) << "Variable used:" << name;
GetThreadLocalUsedVarNameSet()->insert(name);
}
}
static const std::unordered_set<std::string> &GetOpWithUnusedVarAllowSet() {
// NOTE(zhiqiu): Currently, there are some operators which involve unused
// inputs and cannot be removed from the allow_list below.
// They can be mainly divided into three categories:
// 0: the inputs of which are only used in the if branch, or used in the cuda
// kernel but not in the cpu kernel;
// 1: the inputs of which are used to indicate the dtype of the outputs;
// 2: the inputs of which are used in fused operators.
// The category number is presented in the comments after each operator.
// Use pointer here for safe static deinitialization
static auto *allow_set = new std::unordered_set<std::string>({
// called once
"batch_norm", // 0
"batch_norm_grad", // 0
"sync_batch_norm", // 0
......@@ -57,27 +73,14 @@ const std::unordered_set<std::string> op_with_unsed_vars_allow_list = {
"fused_batch_norm_act", // 2
"fused_batch_norm_act_grad", // 2
"data_norm", // 0
"data_norm_grad", // 0
};
namespace paddle {
namespace framework {
std::unordered_set<std::string> *GetThreadLocalUsedVarNameSet() {
thread_local std::unordered_set<std::string> used_var_name_set;
return &used_var_name_set;
}
void LogVarUsageIfUnusedVarCheckEnabled(const std::string &name) {
if (FLAGS_enable_unused_var_check) {
VLOG(6) << "Variable used:" << name;
GetThreadLocalUsedVarNameSet()->insert(name);
}
"data_norm_grad", // 0);
});
return *allow_set;
}
void CheckUnusedVar(const OperatorBase &op, const Scope &scope) {
// skip op in allow list.
if (GetOpWithUnusedVarAllowSet().count(op.Type()) != 0) {
return;
}
auto *used_set = GetThreadLocalUsedVarNameSet();
......
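The check above works by intersecting two sets: a thread-local set of the variable names an op actually read, and the op's declared inputs; ops on the allow set are skipped entirely. A rough Python sketch of that bookkeeping, with hypothetical names (Paddle's real implementation is the C++ above):

import threading

_local = threading.local()

def used_var_names():
    # One set per thread, like GetThreadLocalUsedVarNameSet().
    if not hasattr(_local, "used"):
        _local.used = set()
    return _local.used

def check_unused_vars(op_type, declared_inputs, allow_set):
    # Ops on the allow list are skipped, as in CheckUnusedVar().
    if op_type in allow_set:
        return []
    return [name for name in declared_inputs
            if name not in used_var_names()]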
......@@ -56,9 +56,11 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
return static_cast<nvinfer1::IRuntime*>(
dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
}
#if IS_TRT_VERSION_GE(6000)
static nvinfer1::IPluginRegistry* GetPluginRegistry() {
return static_cast<nvinfer1::IPluginRegistry*>(dy::getPluginRegistry());
}
#endif
// A logger for creating a TensorRT infer builder.
class NaiveLogger : public nvinfer1::ILogger {
......
......@@ -178,12 +178,16 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
std::string name_space_;
std::string plugin_base_;
};
#endif
template <typename T>
class TrtPluginRegistrarV2 {
public:
TrtPluginRegistrarV2() {
static auto func_ptr = GetPluginRegistry();
if (func_ptr != nullptr) {
func_ptr->registerCreator(creator, "");
}
}
private:
T creator;
......@@ -193,6 +197,8 @@ class TrtPluginRegistrarV2 {
static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2<name> \
plugin_registrar_##name {}
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
......
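TrtPluginRegistrarV2 above is the standard static-registration idiom with one twist: because TensorRT is loaded dynamically, GetPluginRegistry() may return nullptr, so the constructor registers the creator only when the registry was actually resolved. A hedged Python analogue of that guarded self-registration (registry and creator names are illustrative):

_plugin_registry = None  # stands in for dy::getPluginRegistry(); may be absent

class TrtPluginRegistrar:
    def __init__(self, creator):
        # Register only if the registry symbol was resolved, mirroring
        # the nullptr check in TrtPluginRegistrarV2's constructor.
        if _plugin_registry is not None:
            _plugin_registry.register_creator(creator, "")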
......@@ -241,6 +241,156 @@ class Flatten2GradOp : public framework::OperatorWithKernel {
}
};
class FlattenContiguousRangeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FlattenContiguousRange");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
"FlattenContiguousRange");
const auto &start_axis = ctx->Attrs().Get<int>("start_axis");
const auto &stop_axis = ctx->Attrs().Get<int>("stop_axis");
const auto &in_dims = ctx->GetInputDim("X");
int in_dims_size = in_dims.size();
int real_start_axis = start_axis, real_stop_axis = stop_axis;
if (start_axis < 0) {
real_start_axis = start_axis + in_dims_size;
}
if (stop_axis < 0) {
real_stop_axis = stop_axis + in_dims_size;
}
PADDLE_ENFORCE_GE(
real_stop_axis, real_start_axis,
platform::errors::InvalidArgument("The stop_axis should be greater"
"than or equal to start_axis."));
const auto &out_dims =
GetOutputShape(real_start_axis, real_stop_axis, in_dims);
ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
if (in_dims[0] == out_dims[0]) {
// Only pass LoD when the first dimension of output and Input(X)
// are the same.
ctx->ShareLoD("X", "Out");
}
OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape", "Flatten2");
std::vector<int64_t> xshape_dims(in_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < in_dims.size(); ++i) {
xshape_dims[i + 1] = in_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", "XShape");
}
static std::vector<int32_t> GetOutputShape(const int start_axis,
const int stop_axis,
const framework::DDim &in_dims) {
int64_t outer = 1;
std::vector<int32_t> out_shape;
int in_dims_size = in_dims.size();
out_shape.reserve(in_dims_size - stop_axis + start_axis);
for (int i = 0; i < start_axis; ++i) {
out_shape.push_back(in_dims[i]);
}
for (int i = start_axis; i <= stop_axis; i++) {
outer *= in_dims[i];
}
out_shape.push_back(outer);
for (int i = stop_axis + 1; i < in_dims_size; i++) {
out_shape.push_back(in_dims[i]);
}
return out_shape;
}
};
class FlattenContiguousRangeOpMaker : public FlattenOpMaker {
public:
void Make() override {
AddInput("X", "(Tensor) A tensor of rank >= axis.");
AddOutput("Out",
"A 2D tensor is reshaped input tensor. The input dimensions"
"up to axis are flattened to the outer dimension of the output"
"and the remaining input dimensions are flattened into the inner"
"dimension of the output.");
AddAttr<int>("start_axis",
"(int)"
"Indicate the input start dimension (exclusive) to flatten")
.SetDefault(1);
AddAttr<int>("stop_axis",
"(int)"
"Indicate the input stop dimension (exclusive) to flatten")
.SetDefault(1);
AddComment(R"DOC(
Flatten Operator
Flattens the input tensor into a new matrix according to start_axis and stop_axis.
Examples:
Case 1:
Given
X.shape = (3, 100, 100, 4)
and
start_axis = 2, stop_axis = -1
We get:
Out.shape = (3, 100, 400)
Case 2:
Given
X.shape = (3, 100, 100, 4)
and
start_axis = 0, stop_axis = -1
We get:
Out.shape = (3 * 100 * 100 * 4)
)DOC");
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in FlattenGradOp.")
.AsIntermediate();
}
};
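GetOutputShape above keeps the dimensions before start_axis and after stop_axis and collapses everything in between into their product; InferShape first normalizes negative axes by adding the input rank, so in the rank-4 DOC examples stop_axis = -1 becomes 3. A small Python sketch of the same computation (assumption: in_dims is a plain list):

def flatten_output_shape(in_dims, start_axis, stop_axis):
    rank = len(in_dims)
    # Normalize negative axes, like real_start_axis/real_stop_axis above.
    start = start_axis + rank if start_axis < 0 else start_axis
    stop = stop_axis + rank if stop_axis < 0 else stop_axis
    outer = 1
    for d in in_dims[start:stop + 1]:
        outer *= d
    return in_dims[:start] + [outer] + in_dims[stop + 1:]

# Case 1 from the DOC: flatten_output_shape([3, 100, 100, 4], 2, -1) -> [3, 100, 400]
# Case 2 from the DOC: flatten_output_shape([3, 100, 100, 4], 0, -1) -> [120000]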
template <typename T>
class FlattenContiguousRangeGradOpMaker
: public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
void Apply(GradOpPtr<T> grad_op) const override {
grad_op->SetType("flatten_contiguous_range_grad");
grad_op->SetInput("XShape", this->Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
grad_op->SetAttrMap(this->Attrs());
}
};
class FlattenContiguousRangeGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *context) const override {
OP_INOUT_CHECK(context->HasInput("XShape"), "Input", "XShape",
"FlattenContiguousRangeGrad");
OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), "Input",
framework::GradVarName("Out"), "FlattenContiguousRangeGrad");
auto xshape_dims = context->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
context->SetOutputDim(framework::GradVarName("X"), x_dims);
context->ShareLoD("XShape", framework::GradVarName("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.device_context());
}
};
DECLARE_INPLACE_OP_INFERER(FlattenOpInplaceInferer, {"X", "Out"});
DECLARE_INPLACE_OP_INFERER(FlattenGradInplaceInferer,
{framework::GradVarName("Out"),
......@@ -266,6 +416,16 @@ REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp,
ops::FlattenGradInplaceInferer);
REGISTER_OPERATOR(
flatten_contiguous_range, ops::FlattenContiguousRangeOp,
ops::FlattenContiguousRangeOpMaker,
ops::FlattenContiguousRangeGradOpMaker<paddle::framework::OpDesc>,
ops::FlattenContiguousRangeGradOpMaker<paddle::imperative::OpBase>,
ops::FlattenOpInplaceInferer);
REGISTER_OPERATOR(flatten_contiguous_range_grad,
ops::FlattenContiguousRangeGradOp,
ops::FlattenGradInplaceInferer);
REGISTER_OP_CPU_KERNEL(
flatten, ops::FlattenKernel<paddle::platform::CPUDeviceContext, float>,
ops::FlattenKernel<paddle::platform::CPUDeviceContext, double>,
......@@ -292,3 +452,26 @@ REGISTER_OP_CPU_KERNEL(
ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int>,
ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
flatten_contiguous_range,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
float>,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
double>,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext, int>,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
int64_t>);
REGISTER_OP_CPU_KERNEL(
flatten_contiguous_range_grad,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
float>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
double>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
int>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
int64_t>);
......@@ -42,3 +42,26 @@ REGISTER_OP_CUDA_KERNEL(
ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int>,
ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
flatten_contiguous_range,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
float>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
double>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext, int>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
int8_t>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
int64_t>);
REGISTER_OP_CUDA_KERNEL(
flatten_contiguous_range_grad,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
float>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
double>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
int>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
int8_t>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
......@@ -112,5 +112,73 @@ class Flatten2GradKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
class FlattenContiguousRangeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto &start_axis = context.Attr<int>("start_axis");
auto &stop_axis = context.Attr<int>("stop_axis");
auto *in = context.Input<framework::LoDTensor>("X");
auto x_dims = in->dims();
int in_dims_size = x_dims.size();
int real_start_axis = start_axis, real_stop_axis = stop_axis;
if (start_axis < 0) {
real_start_axis = start_axis + in_dims_size;
}
if (stop_axis < 0) {
real_stop_axis = stop_axis + in_dims_size;
}
auto *out = context.Output<framework::LoDTensor>("Out");
auto out_dims = framework::make_ddim(
GetOutputShape(real_start_axis, real_stop_axis, x_dims));
out->mutable_data(context.GetPlace(), in->type());
framework::TensorCopy(
*in, context.GetPlace(),
context.template device_context<platform::DeviceContext>(), out);
out->Resize(out_dims);
}
static std::vector<int32_t> GetOutputShape(const int start_axis,
const int stop_axis,
const framework::DDim &in_dims) {
int64_t outer = 1;
std::vector<int32_t> out_shape;
int in_dims_size = in_dims.size();
out_shape.reserve(in_dims_size - stop_axis + start_axis);
for (int i = 0; i < start_axis; ++i) {
out_shape.push_back(in_dims[i]);
}
for (int i = start_axis; i <= stop_axis; i++) {
outer *= in_dims[i];
}
out_shape.push_back(outer);
for (int i = stop_axis + 1; i < in_dims_size; i++) {
out_shape.push_back(in_dims[i]);
}
return out_shape;
}
};
template <typename DeviceContext, typename T>
class FlattenContiguousRangeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *d_x = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto *d_out =
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
auto xshape_dims = ctx.Input<framework::LoDTensor>("XShape")->dims();
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
d_x->mutable_data(ctx.GetPlace(), d_out->type());
framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
d_x->Resize(x_dims);
}
};
} // namespace operators
} // namespace paddle
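The forward kernel above is a TensorCopy followed by a metadata-only Resize: element order never changes, only the shape does, and the grad kernel restores the shape stored in XShape the same way. A NumPy sketch of that behavior (a reshape of a copy stands in for TensorCopy + Resize):

import numpy as np

def flatten_contiguous_range(x, start_axis, stop_axis):
    rank = x.ndim
    start = start_axis + rank if start_axis < 0 else start_axis
    stop = stop_axis + rank if stop_axis < 0 else stop_axis
    outer = int(np.prod(x.shape[start:stop + 1]))
    new_shape = x.shape[:start] + (outer,) + x.shape[stop + 1:]
    # Copy, then change only the shape metadata.
    return x.copy().reshape(new_shape)

x = np.zeros((3, 100, 100, 4))
assert flatten_contiguous_range(x, 2, -1).shape == (3, 100, 400)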
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <NvInfer.h>
#ifdef USE_NVINFER_PLUGIN
#include <NvInferPlugin.h>
#endif
#if !defined(_WIN32)
#include <dlfcn.h>
#endif
......@@ -45,17 +43,14 @@ extern void* tensorrt_plugin_dso_handle;
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \
}); \
static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \
if (p_##__name == nullptr) { \
return nullptr; \
} \
using tensorrt_func = decltype(&::__name); \
return reinterpret_cast<tensorrt_func>(p_##__name)(args...); \
} \
}; \
......@@ -65,28 +60,30 @@ extern void* tensorrt_plugin_dso_handle;
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
std::call_once(tensorrt_plugin_dso_flag, []() { \
tensorrt_plugin_dso_handle = \
paddle::platform::dynload::GetTensorRtPluginHandle(); \
PADDLE_ENFORCE_NOT_NULL( \
tensorrt_plugin_dso_handle, \
platform::errors::Unavailable("Load tensorrt plugin %s failed", \
#__name)); \
}); \
static void* p_##__name = dlsym(tensorrt_plugin_dso_handle, #__name); \
if (p_##__name == nullptr) { \
return nullptr; \
} \
using tensorrt_plugin_func = decltype(&::__name); \
return reinterpret_cast<tensorrt_plugin_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#if (NV_TENSORRT_MAJOR >= 6)
#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
__macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL); \
__macro(getPluginRegistry);
#else
#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
__macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL);
#endif
#define TENSORRT_PLUGIN_RAND_ROUTINE_EACH(__macro) \
__macro(initLibNvInferPlugins);
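Both macros above implement the same lazy-loading idiom: resolve the shared object once (call_once), cache the dlsym result in a function-local static, and return nullptr rather than throwing when the library or symbol is absent. A hedged ctypes sketch of that idiom (the library path is illustrative):

import ctypes
import functools

@functools.lru_cache(maxsize=None)  # plays the role of call_once + the static
def _load_symbol(lib_path, name):
    try:
        lib = ctypes.CDLL(lib_path)
        return getattr(lib, name)
    except (OSError, AttributeError):
        return None  # mirrors the "return nullptr" branch above

def call_trt(name, *args, lib_path="libnvinfer.so"):
    fn = _load_symbol(lib_path, name)
    if fn is None:
        return None
    return fn(*args)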
......
......@@ -16,10 +16,13 @@
from .base.distributed_strategy import DistributedStrategy
from .base.fleet_base import Fleet
from .base.util_factory import UtilBase
from .dataset import *
#from .base.role_maker import PaddleCloudRoleMaker
__all__ = ["DistributedStrategy", "UtilBase"]
__all__ = [
"DistributedStrategy", "UtilBase", "DatasetFactory", "DatasetBase",
"InMemoryDataset", "QueueDataset"
]
fleet = Fleet()
init = fleet.init
......
......@@ -12,5 +12,523 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Defination of Role Makers."""
import os
import numpy as np
from multiprocessing import Process, Manager
import paddle.fluid as fluid
# __all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
__all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
class Role:
WORKER = 1
SERVER = 2
class RoleMakerBase(object):
"""
RoleMakerBase is a base class for assigning a role to current process
in distributed training.
A paddle developer can implement RoleMakerBase to design a role maker
for worker or pserver assignment.
"""
def __init__(self):
self._worker_endpoints = []
self._server_endpoints = []
self._role_is_generated = False
self._role = None
self._current_id = -1
self._node_type = None
self._node_type_comm = None
self._all_comm = None
def is_worker(self):
"""
return is_worker() of current process
"""
raise NotImplementedError("Please implement this method in child class")
def is_server(self):
"""
return is_server() of current process
"""
raise NotImplementedError("Please implement this method in child class")
def is_first_worker(self):
"""
Check whether the node is the first instance of worker.
Returns:
bool: True if this is the first node of worker,
False if not.
"""
raise NotImplementedError("Please implement this method in child class")
def worker_num(self):
"""
Get current total worker number.
Returns:
int: worker number
"""
raise NotImplementedError("Please implement this method in child class")
def server_num(self):
"""
Get current total server number.
Returns:
int: server number
"""
raise NotImplementedError("Please implement this method in child class")
def worker_index(self):
"""
Get current worker id.
Returns:
int: node id
"""
raise NotImplementedError("Please implement this method in child class")
def server_index(self):
"""
Get current server id.
Returns:
int: node id
"""
raise NotImplementedError("Please implement this method in child class")
def role_id(self):
"""
Get current id.
Returns:
int: node id
"""
raise NotImplementedError("Please implement this method in child class")
def get_trainer_endpoints(self):
"""
return trainer endpoints
"""
return self._worker_endpoints
def get_pserver_endpoints(self):
"""
return pserver endpoints
"""
return self._server_endpoints
def to_string(self):
return "role: {}, current_id: {}, worker_endpoints: {}, server_endpoints: {}".format(
self._role, self._current_id, self._worker_endpoints,
self._server_endpoints)
def _all_gather(self, comm_world, input):
"""
Args:
input(int|float): input value
Returns:
return a list of values
"""
print("warning: RoleMakerBase does not have all gather.")
return None
def _all_reduce(self, comm_world, input, mode="sum"):
"""
Args:
input(list|numpy.array): array of one dim
mode(str): "sum" or "min" or "max"
"""
print("warning: RoleMakerBase does not have all reduce worker.")
return None
def _barrier(self, comm_world):
"""
barrier between trainers if current role is TRAINER
"""
print("warning: RoleMakerBase does not have barrier worker.")
class PaddleCloudRoleMaker(RoleMakerBase):
def __init__(self, is_collective=False, init_gloo=True, **kwargs):
super(PaddleCloudRoleMaker, self).__init__()
self._is_collective = is_collective
self._init_gloo = init_gloo
self._kwargs = kwargs
self._role_is_generated = False
self._server_endpoints = None
self._worker_endpoints = None
self._node_type_comm = None
self._all_comm = None
if not self._is_collective:
self._hdfs_name = kwargs.get("hdfs_name", "")
self._hdfs_ugi = kwargs.get("hdfs_ugi", "")
self._hdfs_path = kwargs.get("path", "").rstrip("/")
self._init_timeout_seconds = kwargs.get("init_timeout_seconds",
3600)
self._run_timeout_seconds = kwargs.get("run_timeout_seconds",
9999999)
ip_port = kwargs.get("http_ip_port", "")
self._http_ip_port = []
self._http_server = None
# if ip_port is not empty, it will use http instead of hdfs
if ip_port != "":
self._http_ip_port = ip_port.split(":")
# it's for communication between processes
self._manager = Manager()
# global dict to store status
self._http_server_d = self._manager.dict()
# set running status of http server
self._http_server_d["running"] = False
self._iface = self.__get_default_iface()
# this environment variable can be empty
self._prefix = os.getenv("SYS_JOB_ID", "")
def _barrier(self, comm_world):
if comm_world:
comm_world.barrier()
def _all_gather(self, comm_world, input):
if comm_world:
self._barrier(comm_world)
output = comm_world.all_gather(input)
return output
else:
return None
def _all_reduce(self, comm_world, input, mode="sum"):
if not comm_world:
return None
input = np.array(input)
input_shape = input.shape
input_list = input.reshape(-1).tolist()
self._barrier(comm_world)
ans = comm_world.all_reduce(input_list, mode)
output = np.array(ans).reshape(input_shape)
return output
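_all_reduce above flattens the input to a 1-D list for the Gloo collective and restores the original shape on the result. A toy sketch of that shape handling with NumPy standing in for the comm world (all_reduce_like is illustrative, not part of the API):

import numpy as np

def all_reduce_like(worker_inputs, mode="sum"):
    # Each worker flattens its input to 1-D; the collective reduces
    # elementwise; every worker reshapes the result back, as above.
    arrs = [np.array(x) for x in worker_inputs]
    shape = arrs[0].shape
    flats = [a.reshape(-1) for a in arrs]
    op = {"sum": np.sum, "min": np.min, "max": np.max}[mode]
    return op(np.stack(flats), axis=0).reshape(shape)

# Two workers contributing 2x2 arrays:
print(all_reduce_like([[[1, 2], [3, 4]], [[10, 20], [30, 40]]]))
# -> [[11 22] [33 44]]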
def is_worker(self):
"""
whether current process is worker
"""
if not self._role_is_generated:
self.generate_role()
return self._role == Role.WORKER
def is_server(self):
"""
whether current process is server
"""
if not self._role_is_generated:
self.generate_role()
return self._role == Role.SERVER
def is_first_worker(self):
"""
whether current process is worker of rank 0
"""
if not self._role_is_generated:
self.generate_role()
return self._role == Role.WORKER and self._current_id == 0
def worker_index(self):
"""
get index of current worker
"""
if not self._role_is_generated:
self.generate_role()
return self._current_id
def server_index(self):
"""
get index of current server
"""
if not self._role_is_generated:
self.generate_role()
return self._current_id
def role_id(self):
"""
get index of current node
"""
if self.is_server():
return self.server_index()
elif self.is_worker():
return self.worker_index()
def worker_num(self):
"""
return the current number of workers
"""
if not self._role_is_generated:
self.generate_role()
return self._trainers_num
def server_num(self):
"""
return the current number of server
"""
if not self._role_is_generated:
self.generate_role()
return len(self._server_endpoints)
def get_trainer_endpoints(self):
"""
get endpoint of all trainers
"""
if not self._role_is_generated:
self.generate_role()
return self._worker_endpoints
def get_pserver_endpoints(self):
"""
get endpoint of all pservers
"""
if not self._role_is_generated:
self.generate_role()
return self._server_endpoints
def _get_rank(self):
"""
get current rank in all workers and pservers
"""
if not self._role_is_generated:
self.generate_role()
return self._rank
def _get_size(self):
"""
get total num of all workers and pservers
"""
if not self._role_is_generated:
self.generate_role()
return self._size
def _ps_env(self):
try:
# Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
# format: string(ip:port), eg. 127.0.0.1:6001
self._server_endpoints = os.environ[
"PADDLE_PSERVERS_IP_PORT_LIST"].split(",")
self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
"").split(",")
trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
training_role = os.environ["TRAINING_ROLE"]
if training_role not in ["TRAINER", "PSERVER"]:
raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
if training_role == "TRAINER":
role = Role.WORKER
current_id = int(os.environ["PADDLE_TRAINER_ID"])
if len(self._worker_endpoints) > 0:
self._cur_endpoint = self._worker_endpoints[current_id]
elif training_role == "PSERVER":
role = Role.SERVER
port = os.environ["PADDLE_PORT"]
ip = os.environ["POD_IP"]
self._cur_endpoint = ip + ":" + port
current_id = self._server_endpoints.index(self._cur_endpoint)
else:
raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
except ValueError as ve:
raise ValueError(
"Something wrong with PaddleCloud, please check environment: {}".format(ve))
self._trainers_num = trainers_num
self._role = role
self._current_id = current_id
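_ps_env above derives the role purely from environment variables set by PaddleCloud. An illustrative environment for one pserver process (values are examples, not real endpoints):

import os

os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:6001,127.0.0.1:6002"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6003,127.0.0.1:6004"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "6001"
# _ps_env() would set role = Role.SERVER and current_id = 0, because
# POD_IP:PADDLE_PORT matches the first entry of the server list.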
def _collective_env(self):
self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
self._training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
assert (self._training_role == "TRAINER")
self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
assert self._worker_endpoints is not None, "can't find PADDLE_TRAINER_ENDPOINTS"
self._worker_endpoints = self._worker_endpoints.split(",")
self._trainers_num = len(self._worker_endpoints)
def _init_gloo_env(self):
def init_gloo_instance(role="trainer"):
role = role.lower()
assert role in ["trainer", "pserver", "all"]
if role == "trainer":
all_list = self._worker_endpoints
rank = self._current_id
elif role == "pserver":
all_list = self._server_endpoints
rank = self._current_id
else:
all_list = self._worker_endpoints + self._server_endpoints
rank = all_list.index(self._cur_endpoint)
gloo = fluid.core.Gloo()
gloo.set_rank(rank)
gloo.set_size(len(all_list))
gloo.set_prefix(self._prefix)
gloo.set_iface(self._iface)
gloo.set_timeout_seconds(self._init_timeout_seconds,
self._run_timeout_seconds)
if len(self._http_ip_port) != 0:
gloo.set_http_store(self._http_ip_port[0],
int(self._http_ip_port[1]), role)
else:
gloo.set_hdfs_store(self._hdfs_path + "/" + role,
self._hdfs_name, self._hdfs_ugi)
gloo.init()
return gloo
# paddlecloud support gloo
if self._role == Role.WORKER:
if self._current_id == 0 and len(self._http_ip_port) != 0:
size_d = {
"trainer": len(self._worker_endpoints),
"pserver": len(self._server_endpoints),
"all":
len(self._worker_endpoints) + len(self._server_endpoints)
}
# child process for http server
self._http_server = Process(
target=self.__start_kv_server,
args=(self._http_server_d, size_d))
self._http_server.daemon = True
# set running status to True
self._http_server_d["running"] = True
# start child process
self._http_server.start()
self._node_type = 1
gloo = init_gloo_instance("trainer")
self._node_type_comm = gloo
else:
assert self._role == Role.SERVER
self._node_type = 0
gloo = init_gloo_instance("pserver")
self._node_type_comm = gloo
all_list = self._worker_endpoints + self._server_endpoints
self._rank = all_list.index(self._cur_endpoint)
self._size = len(all_list)
gloo = init_gloo_instance("all")
self._all_comm = gloo
if self._http_server is not None:
# set running status to False
self._http_server_d["running"] = False
# wait until child process exits
self._http_server.join()
def generate_role(self):
"""
generate role for role maker
"""
if not self._role_is_generated:
if not self._is_collective:
self._ps_env()
if self._init_gloo:
self._init_gloo_env()
else:
self._collective_env()
self._role_is_generated = True
def __get_default_iface(self):
"""
get default physical interface
"""
default1 = self.__get_default_iface_from_gateway()
default2 = self.__get_default_iface_from_interfaces()
return default2 if default1 == "lo" else default1
def __get_default_iface_from_gateway(self):
"""
get default physical interface
"""
import netifaces
gateways = netifaces.gateways()
if gateways.get(netifaces.AF_INET) is not None:
gateway = gateways[netifaces.AF_INET]
if len(gateway) > 0 and len(gateway[0]) > 1:
return gateway[0][1]
return "lo"
def __get_default_iface_from_interfaces(self):
"""
get default physical interface
"""
import netifaces
for intf_name in netifaces.interfaces():
addresses = netifaces.ifaddresses(intf_name)
if netifaces.AF_INET in addresses:
ipv4_addresses = addresses[netifaces.AF_INET]
for ipv4_address in ipv4_addresses:
if 'broadcast' in ipv4_address:
return intf_name
return "lo"
def __start_kv_server(self, http_server_d, size_d):
from paddle.fleet.utils import KVServer
http_server = KVServer(int(self._http_ip_port[1]), size_d)
http_server.start()
wait_seconds = 5
while http_server_d.get("running",
False) and not http_server.shoud_stop():
time.sleep(wait_seconds)
http_server.stop()
class UserDefinedRoleMaker(PaddleCloudRoleMaker):
def __init__(self, is_collective=False, init_gloo=False, **kwargs):
super(UserDefinedRoleMaker, self).__init__(
is_collective=is_collective, init_gloo=init_gloo, **kwargs)
def _user_defined_ps_env(self):
self._server_endpoints = self._kwargs.get("server_endpoints")
self._worker_endpoints = self._kwargs.get("worker_endpoints", [])
self._trainers_num = self._kwargs.get("worker_num", 0)
if self._trainers_num == 0:
assert (len(self._worker_endpoints) > 0)
self._trainers_num = len(self._worker_endpoints)
self._role = self._kwargs.get("role")
self._current_id = self._kwargs.get("current_id")
if self._role == Role.WORKER and len(
self._worker_endpoints) > self._current_id:
self._cur_endpoint = self._worker_endpoints[self._current_id]
elif self._role == Role.SERVER:
self._cur_endpoint = self._server_endpoints[self._current_id]
def _user_defined_collective_env(self):
self._worker_endpoints = self._kwargs.get("worker_endpoints")
self._current_id = self._kwargs.get("current_id")
self._trainers_num = len(self._worker_endpoints)
self._training_role = Role.WORKER
def generate_role(self):
"""
generate role for role maker
"""
if not self._role_is_generated:
if not self._is_collective:
self._user_defined_ps_env()
if self._init_gloo:
self._init_gloo_env()
else:
self._user_defined_collective_env()
self._role_is_generated = True
......@@ -18,11 +18,26 @@
__all__ = ['UtilBase']
import numpy as np
import os
import subprocess
from paddle.fluid import core
from collections import OrderedDict
import paddle.fluid as fluid
from google.protobuf import text_format
from paddle.fluid import debugger
from paddle.fluid.framework import Program
from paddle.fluid.proto import framework_pb2
from ..utils.fs import FS, LocalFS, HDFSClient
class UtilFactory(object):
def _create_util(self, context=None):
util = UtilBase()
if context is not None and "valid_strategy" in context:
util._set_strategy(context["valid_strategy"])
if context is not None and "role_maker" in context:
util._set_role_maker(context["role_maker"])
return util
......@@ -38,43 +53,390 @@ class UtilBase(object):
def _set_role_maker(self, role_maker):
self.role_maker = role_maker
def set_file_system(self, fs_client):
assert isinstance(
fs_client,
FS), "fs_client must be the instance of paddle.fleet.utils.FS"
self.fs_client = fs_client
def __check_comm_world(self, comm_world="worker"):
if not self.role_maker._role_is_generated:
self.role_maker.generate_role()
_comm_world = None
comm_world_upper = comm_world.upper()
if comm_world_upper == "WORKER":
if not self.role_maker.is_worker():
print(
"warning: current role is not worker in collective_func(comm_world=\"worker\")"
)
_comm_world = self.role_maker._node_type_comm
elif comm_world_upper == "SERVER":
if not self.role_maker.is_server():
print(
"warning: current role is not server in collective_func(comm_world=\"server\")"
)
_comm_world = self.role_maker._node_type_comm
elif comm_world_upper == "ALL":
_comm_world = self.role_maker._all_comm
else:
raise ValueError(
"unsupported comm_world, please choose one from [worker, server, all]"
)
return _comm_world
def all_reduce(self, input, mode, comm_world="worker"):
_comm_world = self.__check_comm_world(comm_world)
return self.role_maker._all_reduce(_comm_world, input, mode)
def barrier(self, comm_world="worker"):
_comm_world = self.__check_comm_world(comm_world)
self.role_maker._barrier(_comm_world)
def all_gather(self, input, comm_world="worker"):
_comm_world = self.__check_comm_world(comm_world)
return self.role_maker._all_gather(_comm_world, input)
def broadcast(self):
pass
def scatter(self):
pass
def get_file_shard(self, files):
"""
Split files before distributed training, and return the filelist
assigned to the current trainer.
Example 1: files is [a, b, c, d, e] and trainer_num = 2, then trainer
0 gets [a, b, c] and trainer 1 gets [d, e].
Example 2: files is [a, b] and trainer_num = 3, then trainer 0 gets
[a], trainer 1 gets [b], trainer 2 gets [].
Args:
files(list): the file list to be read.
Returns:
list: files belonging to this worker.
"""
if not isinstance(files, list):
raise TypeError("files should be a list of files to be read.")
trainer_id = self.role_maker.worker_index()
trainers = self.role_maker.worker_num()
remainder = len(files) % trainers
blocksize = int(len(files) / trainers)
blocks = [blocksize] * trainers
for i in range(remainder):
blocks[i] += 1
trainer_files = [[]] * trainers
begin = 0
for i in range(trainers):
trainer_files[i] = files[begin:begin + blocks[i]]
begin += blocks[i]
return trainer_files[trainer_id]
def print_on_rank(self, message, rank_id):
if self.role_maker.worker_index() != rank_id:
return
print(message)
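get_file_shard above gives the first len(files) % trainers workers one extra file each, so shards differ in size by at most one. The same splitting logic as a standalone sketch:

def shard(files, trainer_id, trainers):
    remainder = len(files) % trainers
    blocksize = len(files) // trainers
    blocks = [blocksize + (1 if i < remainder else 0) for i in range(trainers)]
    begin = sum(blocks[:trainer_id])
    return files[begin:begin + blocks[trainer_id]]

assert shard(['a', 'b', 'c', 'd', 'e'], 0, 2) == ['a', 'b', 'c']
assert shard(['a', 'b', 'c', 'd', 'e'], 1, 2) == ['d', 'e']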
def _save_program(self, program, model_filename='__model__', is_text=False):
if is_text:
with open(model_filename, "w") as f:
f.write(str(program))
else:
with open(model_filename, "wb") as f:
f.write(program.desc.serialize_to_string())
def _load_program(self, path, is_text):
def load_program_binary(path):
"""load program from binary string file"""
with open(path, "rb") as f:
program_desc_str = f.read()
return Program.parse_from_string(program_desc_str)
def load_program_text(path):
"""load program from human-readable text file"""
with open(path, "r") as f:
program_desc_text = f.read()
prog_desc = framework_pb2.ProgramDesc()
text_format.Merge(program_desc_text, prog_desc)
return Program.parse_from_string(prog_desc.SerializeToString())
if is_text:
return load_program_text(path)
else:
return load_program_binary(path)
def _program_type_trans(self, prog_dir, prog_fn, is_text):
prog = self._load_program(os.path.join(prog_dir, prog_fn), is_text)
prog_out_fn = prog_fn + ".bin" if is_text else prog_fn + ".pbtxt"
self._save_program(prog,
os.path.join(prog_dir, prog_out_fn), 1 - is_text)
return prog_out_fn
def _visualize_graphviz(self, program, output_dir, output_filename):
block = program.global_block()
dot_path = os.path.join(output_dir, output_filename + '.dot')
pdf_path = os.path.join(output_dir, output_filename + '.pdf')
debugger.draw_block_graphviz(block, path=dot_path)
cmd = ["dot", "-Tpdf", dot_path, "-o", pdf_path]
p = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
p.wait()
def _proto_check(self, config):
train_prog = self._load_program(config.train_prog_path,
config.is_text_train_program)
pruned_prog = self._load_program(config.pruned_prog_path,
config.is_text_pruned_program)
is_match = True
pruned_vars = [(v.name, v) for v in pruned_prog.list_vars()
if fluid.io.is_persistable(v)]
pruned_vars = OrderedDict(pruned_vars)
pruned_vars_name = [name for name in pruned_vars]
print("persistable vars in pruned program: {}".format(pruned_vars_name))
# feed and fetch ops are added to the pruned program during pruning, and need not exist in the train program
feed_fetch_type_list = [
core.VarDesc.VarType.FEED_MINIBATCH, core.VarDesc.VarType.FETCH_LIST
]
for var_name in pruned_vars:
var = pruned_vars[var_name]
# feed and fetch ops are added to the pruned program during pruning, and need not exist in the train program
if var.type in feed_fetch_type_list:
break
try:
train_prog_var = train_prog.global_block().var(var_name)
except ValueError as e:
print(
"Cannot find variable '%s' in the train program, please check pruning."
% var_name)
is_match = False
continue
if var.shape != train_prog_var.shape or var.dtype != train_prog_var.dtype:
print(
"variable: {} not match. in pruned program shape: {} dtype:{}, in train program shape: {} dtype: {}".
format(var_name, var.shape, var.dtype, train_prog_var.shape,
train_prog_var.dtype))
is_match = False
return is_match
def _params_check(self, config):
def feed_gen(batch_size, feeded_vars_dims, feeded_vars_filelist):
def reader(batch_size, fn, dim):
data = []
if isinstance(dim, list) or isinstance(dim, tuple):
shape = list(dim)
_temp = 1
for x in dim:
_temp = _temp * x
dim = _temp
else:
shape = [dim]
shape = [batch_size] + shape
dim = dim * batch_size
for line in open(fn, 'r'):
fields = line.strip().split(' ')
fields = [float(d) for d in fields]
while len(fields) >= dim:
tmp = fields[:dim]
fields = fields[dim:]
data.append(np.array(tmp).reshape(shape))
return data
batch_feed = []
for i, fn in enumerate(feeded_vars_filelist):
batch_feed.append(reader(batch_size, fn, feeded_vars_dims[i]))
return batch_feed
prog = self._load_program(
os.path.join(config.dump_model_dir, config.dump_program_filename),
config.is_text_dump_program)
if config.is_text_dump_program:
model_filename = self._program_type_trans(
config.dump_model_dir, config.dump_program_filename,
config.is_text_dump_program)
saved_params = [
v for v in prog.list_vars() if fluid.io.is_persistable(v)
]
print("persistable vars in dump program: {}".format(
[v.name for v in saved_params]))
def check_not_expected_ops(prog, not_expected_op_types):
op_types_set = set()
for op in prog.global_block().ops:
if op.type in not_expected_op_types and op.type not in op_types_set:
op_types_set.add(op.type)
return op_types_set
not_expected_op_types = check_not_expected_ops(prog, ["lookup_table"])
if len(not_expected_op_types) > 0:
print(
"find op type '{}' in program, please check if your program is pruned correctly !".
format(list(not_expected_op_types)))
return False
place = fluid.CPUPlace()
exe = fluid.Executor(place)
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
inference_program, feed_target_names, fetch_targets = \
fluid.io.load_inference_model(config.dump_model_dir, exe, model_filename=model_filename,
params_filename=config.save_params_filename)
# check program vars and saved vars shape
orig_para_shape = {
each_var.name: tuple(each_var.desc.shape())
for each_var in saved_params
}
for each_var in saved_params:
var_temp = fluid.global_scope().find_var(each_var.name)
assert var_temp is not None, "cannot find var: " + each_var.name
new_shape = (np.array(var_temp.get_tensor())).shape
assert each_var.name in orig_para_shape, each_var.name + " MUST be in var list"
orig_shape = orig_para_shape.get(each_var.name)
if new_shape != orig_shape:
raise RuntimeError(
"Shape not matching: the Program requires a parameter with a shape of ({}), "
"while the loaded parameter (namely [ {} ]) has a shape of ({}).".
format(orig_shape, each_var.name, new_shape))
# check feed/fetch vars in program and config
feed_config = config.feed_config
fetch_config = config.fetch_config
fetch_targets_names = [v.name for v in fetch_targets]
if not feed_target_names:
print("warning! no feed targets in program.")
if not fetch_targets_names:
print("warning! no fetch targets in program.")
fetch_list = fetch_targets
feed_name_list = feed_target_names
if feed_config.feeded_vars_names is not None and feed_target_names != feed_config.feeded_vars_names:
print(
"warning! feed vars in program and config are diff: feed in program: {}. feed in config {}.".
format(feed_target_names, feed_config.feeded_vars_names))
feed_name_list = feed_config.feeded_vars_names
# remove feed op in inference_program. new feed op will be added in exe.run
global_block = inference_program.global_block()
need_to_remove_op_index = []
for i, op in enumerate(global_block.ops):
op.desc.set_is_target(False)
if op.type == "feed": # only remove feed op here
need_to_remove_op_index.append(i)
for index in need_to_remove_op_index[::-1]:
global_block._remove_op(index)
if fetch_config.fetch_vars_names is not None and fetch_targets_names != fetch_config.fetch_vars_names:
print(
"warning! fetch vars in program and config are diff: fetch in program: {}. fetch in config {}.".
format(fetch_targets_names, fetch_config.fetch_vars_names))
fetch_list = [
inference_program.global_block().var(i)
for i in fetch_config.fetch_vars_names
]
# remove fetch op in inference_program. new fetch op will be added in exe.run
global_block = inference_program.global_block()
need_to_remove_op_index = []
for i, op in enumerate(global_block.ops):
op.desc.set_is_target(False)
if op.type == "fetch": # only remove fetch op here
need_to_remove_op_index.append(i)
for index in need_to_remove_op_index[::-1]:
global_block._remove_op(index)
# if fetch_list have lod tensor
return_numpy = all([v.lod_level == 0 for v in fetch_list])
# try dump fetch_targets
feed_tensors = []
assert len(feed_config.feeded_vars_names) == len(
feed_config.feeded_vars_dims) == len(
feed_config.feeded_vars_types)
# check program vars and feed tensor shape in config
for i in range(len(feed_config.feeded_vars_names)):
var = inference_program.global_block().var(
feed_config.feeded_vars_names[i])
if not isinstance(feed_config.feeded_vars_dims[i],
(list, tuple)):
tensor_shape = (feed_config.feeded_vars_dims[i], )
else:
tensor_shape = tuple(feed_config.feeded_vars_dims[i])
feed_config.feeded_vars_dims[i] = tensor_shape
var_shape = var.shape[1:]
if tensor_shape != var_shape:
raise RuntimeError(
"feed variable '{}' shape not match. infer program shape: {}. feed tensor shape: {}".
format(feed_config.feeded_vars_names[i], var_shape,
tensor_shape))
if not feed_config.feeded_vars_filelist:
print("generate random feed vars.")
for i in range(len(feed_config.feeded_vars_names)):
var = inference_program.global_block().var(
feed_config.feeded_vars_names[i])
# create fake feed tensor. if lod_level > 1, should create_lod_tensor()
if var.lod_level == 0:
feed_tensors.append(
np.array(
np.random.random(
tuple([config.batch_size] + list(
feed_config.feeded_vars_dims[i]))),
dtype=feed_config.feeded_vars_types[i]))
elif var.lod_level == 1:
t = np.array(
np.random.random(
tuple([config.batch_size] + list(
feed_config.feeded_vars_dims[i]))),
dtype=feed_config.feeded_vars_types[i])
feed_tensors.append(
fluid.create_lod_tensor(t, [[1] * config.batch_size
], place))
else:
raise RuntimeError(
"vars with lod_level >= 2 is not supported now in this infer program check tool."
)
results = exe.run(inference_program,
feed={
name: feed_tensors[i]
for i, name in enumerate(feed_name_list)
},
fetch_list=fetch_list,
return_numpy=return_numpy)
else:
print("load feed vars from files: {}.".format(
feed_config.feeded_vars_filelist))
feed_vars = [
inference_program.global_block().var(
feed_config.feeded_vars_names[i])
for i in range(len(feed_config.feeded_vars_names))
]
feeder = fluid.DataFeeder(feed_list=feed_vars, place=place)
batch_feed = feed_gen(config.batch_size,
feed_config.feeded_vars_dims,
feed_config.feeded_vars_filelist)
slots = [batch_feed]
results = exe.run(inference_program,
feed=feeder.feed(slots),
fetch_list=fetch_list,
return_numpy=return_numpy)
for i, v in enumerate(fetch_list):
print("fetch_targets name: %s" % v.name)
print("fetch_targets: {}".format(results[i]))
return results
fleet_util = UtilFactory()._create_util(None)
......@@ -10,3 +10,5 @@
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from .dataset import *
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This is definition of dataset class, which is high performance IO."""
import paddle
import paddle.fluid as fluid
from paddle.fluid.proto import data_feed_pb2
from google.protobuf import text_format
import paddle.fluid.core as core
class DatasetFactory(object):
"""
DatasetFactory is a factory that creates a dataset by name.
You can create a "QueueDataset", an "InMemoryDataset", or a "FileInstantDataset";
the default is "QueueDataset".
Example:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
"""
def __init__(self):
""" Init. """
pass
def create_dataset(self, datafeed_class="QueueDataset"):
"""
Create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset",
the default is "QueueDataset".
Args:
datafeed_class(str): datafeed class name, QueueDataset or InMemoryDataset.
Default is QueueDataset.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset()
"""
try:
dataset = globals()[datafeed_class]()
return dataset
except:
raise ValueError("datafeed class %s does not exist" %
datafeed_class)
class DatasetBase(object):
""" Base dataset class. """
def __init__(self):
""" Init. """
# define class name here
# to decide whether we need create in memory instance
self.proto_desc = data_feed_pb2.DataFeedDesc()
self.proto_desc.pipe_command = "cat"
self.dataset = core.Dataset("MultiSlotDataset")
self.thread_num = 1
self.filelist = []
def set_pipe_command(self, pipe_command):
"""
Set pipe command of current dataset.
A pipe command is a UNIX pipeline command used to read and
preprocess the raw input data.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_pipe_command("python my_script.py")
Args:
pipe_command(str): pipe command
"""
self.proto_desc.pipe_command = pipe_command
def set_rank_offset(self, rank_offset):
"""
Set rank_offset for merge_pv. It sets the message of Pv.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_rank_offset("rank_offset")
Args:
rank_offset(str): rank_offset's name
"""
self.proto_desc.rank_offset = rank_offset
def set_fea_eval(self, record_candidate_size, fea_eval=True):
"""
Set fea eval mode for slots shuffle, to debug the importance level of
slots (features); fea_eval needs to be set to True for slots shuffle.
Args:
record_candidate_size(int): size of instances candidate to shuffle
one slot
fea_eval(bool): whether enable fea eval mode to enable slots shuffle.
default is True.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_fea_eval(1000000, True)
"""
if fea_eval:
self.dataset.set_fea_eval(fea_eval, record_candidate_size)
self.fea_eval = fea_eval
def slots_shuffle(self, slots):
"""
Slots Shuffle
Slots Shuffle is a shuffle method at the slot level, usually used
for sparse features with a large number of instances. To evaluate the
importance level of slots (features), compare the metric, e.g. auc,
while doing slots shuffle on one or several slots against the baseline.
Args:
slots(list[string]): the set of slots (string) to do slots shuffle.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_merge_by_lineid()
#suppose there is a slot 0
dataset.slots_shuffle(['0'])
"""
if self.fea_eval:
slots_set = set(slots)
self.dataset.slots_shuffle(slots_set)
def set_batch_size(self, batch_size):
"""
Set batch size. Will be effective during training
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_batch_size(128)
Args:
batch_size(int): batch size
"""
self.proto_desc.batch_size = batch_size
def set_pv_batch_size(self, pv_batch_size):
"""
Set pv batch size. It will be effective during enable_pv_merge
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_pv_batch_size(128)
Args:
pv_batch_size(int): pv batch size
"""
self.proto_desc.pv_batch_size = pv_batch_size
def set_thread(self, thread_num):
"""
Set thread num, it is the num of readers.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_thread(12)
Args:
thread_num(int): thread num
"""
self.dataset.set_thread_num(thread_num)
self.thread_num = thread_num
def set_filelist(self, filelist):
"""
Set file list in current worker.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_filelist(['a.txt', 'b.txt'])
Args:
filelist(list): file list
"""
self.dataset.set_filelist(filelist)
self.filelist = filelist
def set_input_type(self, input_type):
self.proto_desc.input_type = input_type
def set_use_var(self, var_list):
"""
Set Variables which you will use.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var([data, label])
Args:
var_list(list): variable list
"""
multi_slot = self.proto_desc.multi_slot_desc
for var in var_list:
slot_var = multi_slot.slots.add()
slot_var.is_used = True
slot_var.name = var.name
if var.lod_level == 0:
slot_var.is_dense = True
slot_var.shape.extend(var.shape)
if var.dtype == core.VarDesc.VarType.FP32:
slot_var.type = "float"
elif var.dtype == core.VarDesc.VarType.INT64:
slot_var.type = "uint64"
else:
raise ValueError(
"Currently, fluid.dataset only supports dtype=float32 and dtype=int64"
)
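set_use_var above maps each Variable onto a slot entry in the proto: a slot is dense iff lod_level == 0 (in which case its shape is recorded), FP32 becomes "float" and INT64 becomes "uint64". A hedged sketch of that mapping as plain Python data (slot_from_var is a hypothetical helper, not part of the API):

def slot_from_var(name, shape, lod_level, dtype):
    # dtype here is "fp32" or "int64", standing in for core.VarDesc.VarType.
    type_map = {"fp32": "float", "int64": "uint64"}
    slot = {"name": name, "is_used": True, "type": type_map[dtype]}
    if lod_level == 0:
        slot["is_dense"] = True
        slot["shape"] = list(shape)
    return slot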
def set_hdfs_config(self, fs_name, fs_ugi):
"""
Set hdfs config: fs name and ugi.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_hdfs_config("my_fs_name", "my_fs_ugi")
Args:
fs_name(str): fs name
fs_ugi(str): fs ugi
"""
self.dataset.set_hdfs_config(fs_name, fs_ugi)
def set_download_cmd(self, download_cmd):
"""
Set customized download cmd: download_cmd
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_download_cmd("./read_from_afs")
Args:
download_cmd(str): customized download command
"""
self.dataset.set_download_cmd(download_cmd)
def _prepare_to_run(self):
"""
Set data_feed_desc before load or shuffle;
the user does not need to call this function.
"""
if self.thread_num > len(self.filelist):
self.thread_num = len(self.filelist)
self.dataset.set_thread_num(self.thread_num)
self.dataset.set_data_feed_desc(self.desc())
self.dataset.create_readers()
def _finish_to_run(self):
self.dataset.destroy_readers()
def desc(self):
"""
Returns a protobuf message for this DataFeedDesc
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset()
print(dataset.desc())
Returns:
A string message
"""
return text_format.MessageToString(self.proto_desc)
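desc() above serializes the accumulated proto_desc with the protobuf text format, which the C++ readers parse back into a DataFeedDesc. A short usage sketch; the output shown in comments is abridged and illustrative, since the exact fields depend on the proto defaults:

import paddle.fluid as fluid

dataset = fluid.DatasetFactory().create_dataset()
dataset.set_batch_size(128)
print(dataset.desc())
# Expected to include lines such as:
#   batch_size: 128
#   pipe_command: "cat"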
def _dynamic_adjust_before_train(self, thread_num):
pass
def _dynamic_adjust_after_train(self):
pass
class InMemoryDataset(DatasetBase):
"""
InMemoryDataset loads data into memory
and shuffles the data before training.
Instances of this class should be created by DatasetFactory.
Example:
dataset = paddle.fluid.DatasetFactory().create_dataset("InMemoryDataset")
"""
def __init__(self):
""" Init. """
super(InMemoryDataset, self).__init__()
self.proto_desc.name = "MultiSlotInMemoryDataFeed"
self.fleet_send_batch_size = None
self.is_user_set_queue_num = False
self.queue_num = None
self.parse_ins_id = False
self.parse_content = False
self.parse_logkey = False
self.merge_by_sid = True
self.enable_pv_merge = False
self.merge_by_lineid = False
self.fleet_send_sleep_seconds = None
def set_feed_type(self, data_feed_type):
"""
Set data_feed_desc
"""
self.proto_desc.name = data_feed_type
def _prepare_to_run(self):
"""
Set data_feed_desc before load or shuffle;
the user does not need to call this function.
"""
if self.thread_num <= 0:
self.thread_num = 1
self.dataset.set_thread_num(self.thread_num)
if self.queue_num is None:
self.queue_num = self.thread_num
self.dataset.set_queue_num(self.queue_num)
self.dataset.set_parse_ins_id(self.parse_ins_id)
self.dataset.set_parse_content(self.parse_content)
self.dataset.set_parse_logkey(self.parse_logkey)
self.dataset.set_merge_by_sid(self.merge_by_sid)
self.dataset.set_enable_pv_merge(self.enable_pv_merge)
self.dataset.set_data_feed_desc(self.desc())
self.dataset.create_channel()
self.dataset.create_readers()
def _dynamic_adjust_before_train(self, thread_num):
if not self.is_user_set_queue_num:
self.dataset.dynamic_adjust_channel_num(thread_num, False)
self.dataset.dynamic_adjust_readers_num(thread_num)
def _dynamic_adjust_after_train(self):
if not self.is_user_set_queue_num:
self.dataset.dynamic_adjust_channel_num(self.thread_num, False)
self.dataset.dynamic_adjust_readers_num(self.thread_num)
def set_queue_num(self, queue_num):
"""
Set Dataset output queue num, training threads get data from queues
Args:
queue_num(int): dataset output queue num
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_queue_num(12)
"""
self.is_user_set_queue_num = True
self.queue_num = queue_num
def set_parse_ins_id(self, parse_ins_id):
"""
Set if Dataset needs to parse ins_id.
Args:
parse_ins_id(bool): if parse ins_id or not
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_parse_ins_id(True)
"""
self.parse_ins_id = parse_ins_id
def set_parse_content(self, parse_content):
"""
Set if Dataset needs to parse content.
Args:
parse_content(bool): if parse content or not
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_parse_content(True)
"""
self.parse_content = parse_content
def set_parse_logkey(self, parse_logkey):
"""
Set if Dataset needs to parse logkey.
Args:
parse_logkey(bool): whether to parse logkey or not
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_parse_logkey(True)
"""
self.parse_logkey = parse_logkey
def set_merge_by_sid(self, merge_by_sid):
"""
Set whether Dataset needs to merge instances by sid. If not, one ins means one pv.
Args:
merge_by_sid(bool): whether to merge by sid or not
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_merge_by_sid(True)
"""
self.merge_by_sid = merge_by_sid
def set_enable_pv_merge(self, enable_pv_merge):
"""
Set whether Dataset needs to merge pv instances.
Args:
enable_pv_merge(bool): whether to enable pv merge or not
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_enable_pv_merge(True)
"""
self.enable_pv_merge = enable_pv_merge
def preprocess_instance(self):
"""
Merge pv instances and convey them from input_channel to input_pv_channel.
It takes effect only when enable_pv_merge is True.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.preprocess_instance()
"""
self.dataset.preprocess_instance()
def set_current_phase(self, current_phase):
"""
Set the current phase in training. It is useful for unittest.
current_phase : 1 for join, 0 for update.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.set_current_phase(1)
"""
self.dataset.set_current_phase(current_phase)
def postprocess_instance(self):
"""
Divide pv instances and convey them back to input_channel.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.preprocess_instance()
exe.train_from_dataset(dataset)
dataset.postprocess_instance()
"""
self.dataset.postprocess_instance()
def set_fleet_send_batch_size(self, fleet_send_batch_size=1024):
"""
Set fleet send batch size, default is 1024
Args:
fleet_send_batch_size(int): fleet send batch size
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_fleet_send_batch_size(800)
"""
self.fleet_send_batch_size = fleet_send_batch_size
def set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0):
"""
Set fleet send sleep time, default is 0
Args:
fleet_send_sleep_seconds(int): fleet send sleep time
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_fleet_send_sleep_seconds(2)
"""
self.fleet_send_sleep_seconds = fleet_send_sleep_seconds
def set_merge_by_lineid(self, merge_size=2):
"""
Set merge by line id; instances with the same line id will be merged after
shuffle. You should parse the line id in your data generator.
Args:
merge_size(int): ins size to merge. default is 2.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_merge_by_lineid()
"""
self.dataset.set_merge_by_lineid(merge_size)
self.merge_by_lineid = True
self.parse_ins_id = True
def set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num):
self.dataset.set_generate_unique_feasigns(generate_uni_feasigns)
self.gen_uni_feasigns = generate_uni_feasigns
self.local_shard_num = shard_num
def generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num,
consume_thread_num, shard_num):
self.dataset.generate_local_tables_unlock(
table_id, fea_dim, read_thread_num, consume_thread_num, shard_num)
def load_into_memory(self):
"""
Load data into memory
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
"""
self._prepare_to_run()
self.dataset.load_into_memory()
def preload_into_memory(self, thread_num=None):
"""
Load data into memory in async mode
Args:
thread_num(int): preload thread num
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.preload_into_memory()
dataset.wait_preload_done()
"""
self._prepare_to_run()
if thread_num is None:
thread_num = self.thread_num
self.dataset.set_preload_thread_num(thread_num)
self.dataset.create_preload_readers()
self.dataset.preload_into_memory()
def wait_preload_done(self):
"""
Wait preload_into_memory done
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.preload_into_memory()
dataset.wait_preload_done()
"""
self.dataset.wait_preload_done()
self.dataset.destroy_preload_readers()
def local_shuffle(self):
"""
Local shuffle
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.local_shuffle()
"""
self.dataset.local_shuffle()
def global_shuffle(self, fleet=None, thread_num=12):
"""
Global shuffle.
Global shuffle can only be used in distributed mode, i.e. multiple
processes on a single machine or multiple machines training together.
If you run in distributed mode, pass a fleet instance instead of None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.global_shuffle(fleet)
Args:
fleet(Fleet): fleet singleton. Default None.
thread_num(int): shuffle thread num. Default is 12.
"""
trainer_num = 1
if fleet is not None:
fleet._role_maker.barrier_worker()
trainer_num = fleet.worker_num()
if self.fleet_send_batch_size is None:
self.fleet_send_batch_size = 1024
if self.fleet_send_sleep_seconds is None:
self.fleet_send_sleep_seconds = 0
self.dataset.register_client2client_msg_handler()
self.dataset.set_trainer_num(trainer_num)
self.dataset.set_fleet_send_batch_size(self.fleet_send_batch_size)
self.dataset.set_fleet_send_sleep_seconds(self.fleet_send_sleep_seconds)
if fleet is not None:
fleet._role_maker.barrier_worker()
self.dataset.global_shuffle(thread_num)
if fleet is not None:
fleet._role_maker.barrier_worker()
if self.merge_by_lineid:
self.dataset.merge_by_lineid()
if fleet is not None:
fleet._role_maker.barrier_worker()
def release_memory(self):
"""
:api_attr: Static Graph
Release InMemoryDataset memory data, when data will not be used again.
Examples:
.. code-block:: python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.global_shuffle(fleet)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
exe.train_from_dataset(fluid.default_main_program(), dataset)
dataset.release_memory()
"""
self.dataset.release_memory()
def get_pv_data_size(self):
"""
Get the pv data size in memory; users can call this function to get the
number of pv instances across all workers after loading into memory.
Note:
This function may hurt performance, because it contains a barrier.
Returns:
The size of memory pv data.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
print(dataset.get_pv_data_size())
"""
return self.dataset.get_pv_data_size()
def get_memory_data_size(self, fleet=None):
"""
Get the memory data size; users can call this function to get the
number of instances across all workers after loading into memory.
Note:
This function may hurt performance, because it contains a barrier.
Args:
fleet(Fleet): Fleet Object.
Returns:
The size of memory data.
Examples:
.. code-block:: python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
print(dataset.get_memory_data_size(fleet))
"""
import numpy as np
local_data_size = self.dataset.get_memory_data_size()
local_data_size = np.array([local_data_size])
if fleet is not None:
global_data_size = local_data_size * 0
fleet._role_maker.all_reduce_worker(local_data_size,
global_data_size)
return global_data_size[0]
return local_data_size[0]
def get_shuffle_data_size(self, fleet=None):
"""
Get the shuffle data size; users can call this function to get the
number of instances across all workers after local/global shuffle.
Note:
This function may hurt the performance of local shuffle,
because it contains a barrier. It does not affect global shuffle.
Args:
fleet(Fleet): Fleet Object.
Returns:
The size of shuffle data.
Examples:
.. code-block:: python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.global_shuffle(fleet)
print(dataset.get_shuffle_data_size(fleet))
"""
import numpy as np
local_data_size = self.dataset.get_shuffle_data_size()
local_data_size = np.array([local_data_size])
if fleet is not None:
global_data_size = local_data_size * 0
fleet._role_maker.all_reduce_worker(local_data_size,
global_data_size)
return global_data_size[0]
return local_data_size[0]
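# A hedged end-to-end sketch of the single-machine InMemoryDataset workflow
# documented above; "a.txt"/"b.txt" and the slot/use_var configuration are
# assumed to be prepared elsewhere (fleet is only needed for global shuffle).
import paddle.fluid as fluid

demo = fluid.DatasetFactory().create_dataset("InMemoryDataset")
demo.set_thread(2)
demo.set_filelist(["a.txt", "b.txt"])
demo.load_into_memory()              # fill the in-memory channel
demo.local_shuffle()                 # single-machine shuffle
print(demo.get_memory_data_size())   # instance count on this worker
demo.release_memory()                # free memory once training is done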
class QueueDataset(DatasetBase):
"""
QueueDataset, it processes data in a streaming fashion.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
"""
def __init__(self):
"""
Initialize QueueDataset
This class should be created by DatasetFactory
"""
super(QueueDataset, self).__init__()
self.proto_desc.name = "MultiSlotDataFeed"
def _prepare_to_run(self):
"""
Set data_feed_desc/thread num/filelist before run;
users do not need to call this function.
"""
if self.thread_num > len(self.filelist):
self.thread_num = len(self.filelist)
if self.thread_num == 0:
self.thread_num = 1
self.dataset.set_thread_num(self.thread_num)
self.dataset.set_filelist(self.filelist)
self.dataset.set_data_feed_desc(self.desc())
self.dataset.create_readers()
def local_shuffle(self):
"""
Local shuffle data.
Local shuffle is not supported in QueueDataset;
a NotImplementedError will be raised.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset.local_shuffle()
Raises:
NotImplementedError: QueueDataset does not support local shuffle
"""
raise NotImplementedError(
"QueueDataset does not support local shuffle, "
"please use InMemoryDataset for local_shuffle")
def global_shuffle(self, fleet=None):
"""
Global shuffle data.
Global shuffle is not supported in QueueDataset;
a NotImplementedError will be raised.
Args:
fleet(Fleet): fleet singleton. Default None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset.global_shuffle(fleet)
Raises:
NotImplementedError: QueueDataset does not support global shuffle
"""
raise NotImplementedError(
"QueueDataset does not support global shuffle, "
"please use InMemoryDataset for global_shuffle")
class FileInstantDataset(DatasetBase):
"""
FileInstantDataset, it processes data in a streaming fashion.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset")
"""
def __init__(self):
"""
Initialize FileInstantDataset
This class should be created by DatasetFactory
"""
super(FileInstantDataset, self).__init__()
self.proto_desc.name = "MultiSlotFileInstantDataFeed"
def local_shuffle(self):
"""
Local shuffle
FileInstantDataset does not support local shuffle
"""
raise NotImplementedError(
"FileInstantDataset does not support local shuffle, "
"please use InMemoryDataset for local_shuffle")
def global_shuffle(self, fleet=None):
"""
Global shuffle
FileInstantDataset does not support global shuffle
"""
raise NotImplementedError(
"FileInstantDataset does not support global shuffle, "
"please use InMemoryDataset for global_shuffle")
class BoxPSDataset(InMemoryDataset):
"""
BoxPSDataset: derived from InMemoryDataset.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
"""
def __init__(self):
"""
Initialize BoxPSDataset
This class should be created by DatasetFactory
"""
super(BoxPSDataset, self).__init__()
self.boxps = core.BoxPS(self.dataset)
self.proto_desc.name = "PaddleBoxDataFeed"
def set_date(self, date):
"""
Set the date of the training pass; `date` is a string in YYYYMMDD format
"""
year = int(date[:4])
month = int(date[4:6])
day = int(date[6:])
self.boxps.set_date(year, month, day)
def begin_pass(self):
"""
Begin Pass
Notify BoxPS to load sparse parameters of next pass to GPU Memory
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
dataset.begin_pass()
"""
self.boxps.begin_pass()
def end_pass(self, need_save_delta):
"""
End Pass
Notify BoxPS that current pass ended
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
dataset.end_pass(True)
"""
self.boxps.end_pass(need_save_delta)
def wait_preload_done(self):
"""
Wait until the async preload is done,
i.e. wait until the feed pass finishes
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.preload_into_memory()
dataset.wait_preload_done()
"""
self.boxps.wait_feed_pass_done()
def load_into_memory(self):
"""
Load next pass into memory and notify boxps to fetch its emb from SSD
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
"""
self._prepare_to_run()
self.boxps.load_into_memory()
def preload_into_memory(self):
"""
Begin async preload next pass while current pass may be training
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.preload_into_memory()
"""
self._prepare_to_run()
self.boxps.preload_into_memory()
def _dynamic_adjust_before_train(self, thread_num):
if not self.is_user_set_queue_num:
self.dataset.dynamic_adjust_channel_num(thread_num, True)
self.dataset.dynamic_adjust_readers_num(thread_num)
def _dynamic_adjust_after_train(self):
pass
def slots_shuffle(self, slots):
"""
Slots Shuffle
Slots shuffle is a shuffle method at the slot level, usually used on
sparse features with a large number of instances. By comparing a metric,
e.g. AUC, after shuffling one or several slots against the baseline, the
importance level of the slots (features) can be evaluated.
Args:
slots(list[string]): the set of slots(string) to do slots shuffle.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
dataset.set_merge_by_lineid()
# suppose there is a slot 0
dataset.slots_shuffle(['0'])
"""
slots_set = set(slots)
self.boxps.slots_shuffle(slots_set)
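# A hedged sketch of the pass-based BoxPS workflow implied by the methods
# above (set_date/begin_pass/load_into_memory/end_pass); the exact ordering
# and the executor/train step are placeholders for a real pipeline.
import paddle.fluid as fluid

box_ds = fluid.DatasetFactory().create_dataset("BoxPSDataset")
box_ds.set_date("20200601")          # YYYYMMDD
box_ds.set_filelist(["a.txt", "b.txt"])
box_ds.begin_pass()                  # sparse params of this pass -> GPU
box_ds.load_into_memory()            # also fetches embeddings from SSD
# exe.train_from_dataset(fluid.default_main_program(), box_ds)
box_ds.end_pass(True)                # notify BoxPS the pass ended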
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fs import *
from .http_server import KVHandler, KVHTTPServer, KVServer
__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import subprocess
import multiprocessing
from datetime import datetime
import re
import copy
import errno
import time
import logging
import six
import abc
import paddle.fluid as fluid
import functools
from pathlib import PurePosixPath, Path
import shutil
__all__ = [
'FS', 'LocalFS', 'HDFSClient', 'ExecuteError', 'FSTimeOut',
'FSFileExistsError', 'FSFileNotExistsError'
]
class ExecuteError(Exception):
pass
class FSFileExistsError(Exception):
pass
class FSFileNotExistsError(Exception):
pass
class FSTimeOut(Exception):
pass
class FS(object):
@abc.abstractmethod
def ls_dir(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def is_file(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def is_dir(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def is_exist(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def upload(self, local_path, fs_path):
raise NotImplementedError
@abc.abstractmethod
def download(self, fs_path, local_path):
raise NotImplementedError
@abc.abstractmethod
def mkdirs(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def delete(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def need_upload_download(self):
raise NotImplementedError
@abc.abstractmethod
def rename(self, fs_src_path, fs_dst_path):
raise NotImplementedError
@abc.abstractmethod
def mv(self, fs_src_path, fs_dst_path):
raise NotImplementedError
@abc.abstractmethod
def upload_dir(self, local_dir, dest_dir):
raise NotImplementedError
@abc.abstractmethod
def glob(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def stat(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def walk(self, fs_path):
raise NotImplementedError
class LocalFS(FS):
def ls_dir(self, fs_path):
if not self.is_exist(fs_path):
return [], []
dirs = []
files = []
for f in os.listdir(fs_path):
if os.path.isdir(fs_path + "/" + f):
dirs.append(f)
else:
files.append(f)
return dirs, files
def mkdirs(self, fs_path):
assert not os.path.isfile(fs_path), "{} is already a file".format(
fs_path)
os.system("mkdir -p {}".format(fs_path))
def is_file(self, fs_path):
return os.path.isfile(fs_path)
def is_dir(self, fs_path):
return os.path.isdir(fs_path)
def is_exist(self, fs_path):
return os.path.exists(fs_path)
def _rmr(self, fs_path):
shutil.rmtree(fs_path)
def _rm(self, fs_path):
os.remove(fs_path)
def delete(self, fs_path):
if not self.is_exist(fs_path):
return
if os.path.isfile(fs_path):
return self._rm(fs_path)
return self._rmr(fs_path)
def rename(self, fs_src_path, fs_dst_path):
os.rename(fs_src_path, fs_dst_path)
def need_upload_download(self):
return False
def touch(self, fs_path):
return Path(fs_path).touch()
def mv(self, src_path, dst_path):
if not self.is_exist(src_path):
raise FSFileNotExistsError
if self.is_exist(dst_path):
raise FSFileExistsError
return self.rename(src_path, dst_path)
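# A quick usage sketch of LocalFS; it mirrors the HDFSClient interface so
# checkpoint code can swap backends. The /tmp path below is an assumption.
local_fs = LocalFS()
local_fs.mkdirs("/tmp/paddle_fs_demo/sub")
local_fs.touch("/tmp/paddle_fs_demo/sub/a.txt")
dirs, files = local_fs.ls_dir("/tmp/paddle_fs_demo")
print(dirs, files)                   # ['sub'] []
local_fs.mv("/tmp/paddle_fs_demo/sub/a.txt", "/tmp/paddle_fs_demo/sub/b.txt")
local_fs.delete("/tmp/paddle_fs_demo")   # recursive delete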
"""HDFS Utils."""
def _handle_errors(f):
def handler(*args, **kwargs):
start = time.time()
while True:
try:
return f(*args, **kwargs)
except ExecuteError as e:
o = args[0]
time_out = float(o._time_out) / 1000.0
inter = float(o._sleep_inter) / 1000.0
if time.time() - start >= time_out:
raise FSTimeOut
time.sleep(inter)
return functools.wraps(f)(handler)
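# _handle_errors retries the wrapped method whenever it raises ExecuteError,
# sleeping `_sleep_inter` ms between attempts and raising FSTimeOut once
# `_time_out` ms have elapsed. A hedged illustration on a toy object:
class _ToyFS(object):
    _time_out = 2000     # ms, read by the handler via args[0] (i.e. self)
    _sleep_inter = 100   # ms

    @_handle_errors
    def flaky(self):
        raise ExecuteError()   # always fails -> FSTimeOut after ~2 seconds

# _ToyFS().flaky()  # would raise FSTimeOut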
class HDFSClient(FS):
def __init__(
self,
hadoop_home,
configs,
time_out=5 * 60 * 1000, #ms
sleep_inter=1000): #ms
# Raises KeyError if JAVA_HOME is not set in the environment.
java_home = os.environ["JAVA_HOME"]
self.pre_commands = []
hadoop_bin = '%s/bin/hadoop' % hadoop_home
self.pre_commands.append(hadoop_bin)
dfs = 'fs'
self.pre_commands.append(dfs)
if configs:
for k, v in six.iteritems(configs):
self.pre_commands.append('-D%s=%s' % (k, v))
self._time_out = time_out
self._sleep_inter = sleep_inter
self._base_cmd = " ".join(self.pre_commands)
self._bd_err_re = re.compile(
r'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:')
def _run_cmd(self, cmd, redirect_stderr=False):
ret, output = fluid.core.shell_execute_cmd(cmd, 0, 0, redirect_stderr)
return int(ret), output.splitlines()
@_handle_errors
def ls_dir(self, fs_path):
"""
List the directory under fs_path, returning only the pure names, without the fs_path prefix
"""
if not self.is_exist(fs_path):
return [], []
cmd = "{} -ls {}".format(self._base_cmd, fs_path)
ret, lines = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
dirs = []
files = []
for line in lines:
arr = line.split()
if len(arr) != 8:
continue
if fs_path not in arr[7]:
continue
p = PurePosixPath(arr[7])
if arr[0][0] == 'd':
dirs.append(p.name)
else:
files.append(p.name)
return dirs, files
def _test_match(self, lines):
for l in lines:
m = self._bd_err_re.match(l)
if m is not None:
return m
return None
@_handle_errors
def is_dir(self, fs_path):
if not self.is_exist(fs_path):
return False
cmd = "{} -test -d {}".format(
self._base_cmd, fs_path, redirect_stderr=True)
ret, lines = self._run_cmd(cmd)
if ret:
# other error
if self._test_match(lines) is not None:
raise ExecuteError
return False
return True
def is_file(self, fs_path):
if not self.is_exist(fs_path):
return False
return not self.is_dir(fs_path)
@_handle_errors
def is_exist(self, fs_path):
cmd = "{} -ls {} ".format(self._base_cmd, fs_path)
ret, out = self._run_cmd(cmd, redirect_stderr=True)
if ret != 0:
for l in out:
if "No such file or directory" in l:
return False
raise ExecuteError
return True
@_handle_errors
def upload(self, local_path, fs_path):
if self.is_exist(fs_path):
raise FSFileExistsError
local = LocalFS()
if not local.is_exist(local_path):
raise FSFileNotExistsError
cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
ret, lines = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def download(self, fs_path, local_path):
if self.is_exist(local_path):
raise FSFileExistsError
if not self.is_exist(fs_path):
raise FSFileNotExistsError
cmd = "{} -get {} {}".format(self._base_cmd, fs_path, local_path)
ret, lines = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def mkdirs(self, fs_path):
if self.is_exist(fs_path):
return
cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
ret, lines = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def mv(self, fs_src_path, fs_dst_path, test_exists=True):
if test_exists:
if not self.is_exist(fs_src_path):
raise FSFileNotExistsError
if self.is_exist(fs_dst_path):
raise FSFileExistsError
cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
ret, _ = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def _rmr(self, fs_path):
cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
ret, _ = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def _rm(self, fs_path):
cmd = "{} -rm {}".format(self._base_cmd, fs_path)
ret, _ = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
def delete(self, fs_path):
if not self.is_exist(fs_path):
return
is_dir = self.is_dir(fs_path)
if is_dir:
return self._rmr(fs_path)
return self._rm(fs_path)
def need_upload_download(self):
return True
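# A hedged construction sketch for HDFSClient; the hadoop home, the
# fs.default.name endpoint and the ugi credentials below are placeholders.
configs = {
    "fs.default.name": "hdfs://demo-nn:54310",
    "hadoop.job.ugi": "user,passwd",
}
hdfs = HDFSClient("/usr/local/hadoop", configs,
                  time_out=5 * 60 * 1000,   # ms
                  sleep_inter=1000)         # ms
# if not hdfs.is_exist("/user/demo"):
#     hdfs.mkdirs("/user/demo")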
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Http Server."""
import logging
import six
# NOTE: HTTPServer has a different name in python2 and python3
if six.PY2:
from BaseHTTPServer import HTTPServer
import SimpleHTTPServer
else:
from http.server import HTTPServer
import http.server as SimpleHTTPServer
import time
import threading
import socket
def get_logger(name, level, fmt):
logger = logging.getLogger(name)
logger.setLevel(level)
handler = logging.FileHandler('http.log', mode='w')
formatter = logging.Formatter(fmt=fmt)
handler.setFormatter(formatter)
logger.addHandler(handler)
return logger
_http_server_logger = get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
"""
kv handler class for the kv HTTP server;
it defines how to get/set kv pairs in the server.
"""
def do_GET(self):
"""
GET method for the kv handler; gets the value according to the key.
"""
log_str = "GET " + self.address_string() + self.path
paths = self.path.split('/')
if len(paths) < 3:
print('len of request path must be 3: ' + self.path)
self.send_status_code(400)
return
_, scope, key = paths
with self.server.kv_lock:
value = self.server.kv.get(scope, {}).get(key)
if value is None:
log_str += ' , key not found: ' + key
self.send_status_code(404)
else:
log_str += ' , key found: ' + key
self.send_response(200)
self.send_header("Content-Length", str(len(value)))
self.end_headers()
self.wfile.write(value)
_http_server_logger.info(log_str)
def do_PUT(self):
"""
PUT method for the kv handler; sets the value according to the key.
"""
log_str = "PUT " + self.address_string() + self.path
paths = self.path.split('/')
if len(paths) < 3:
print('len of request path must be 3: ' + self.path)
self.send_status_code(400)
return
_, scope, key = paths
content_length = int(self.headers['Content-Length'])
try:
value = self.rfile.read(content_length)
except Exception:
print("received invalid request, failed to read body")
self.send_status_code(404)
return
with self.server.kv_lock:
if self.server.kv.get(scope) is None:
self.server.kv[scope] = {}
self.server.kv[scope][key] = value
self.send_status_code(200)
_http_server_logger.info(log_str)
def do_DELETE(self):
"""
DELETE method for the kv handler; deletes the value according to the key.
"""
log_str = "DELETE " + self.address_string() + self.path
paths = self.path.split('/')
if len(paths) < 3:
print('len of request path must be 3: ' + self.path)
self.send_status_code(400)
return
_, scope, key = paths
with self.server.delete_kv_lock:
if self.server.delete_kv.get(scope) is None:
self.server.delete_kv[scope] = []
self.server.delete_kv[scope].append(key)
self.send_status_code(200)
_http_server_logger.info(log_str)
def log_message(self, format, *args):
"""
ignore all logging messages in kv handler.
"""
pass
def send_status_code(self, code):
"""
send status code back to client.
"""
self.send_response(code)
self.send_header("Content-Length", 0)
self.end_headers()
class KVHTTPServer(HTTPServer, object):
"""
an HTTP server that stores kv pairs.
"""
def __init__(self, port, handler):
"""Init."""
super(KVHTTPServer, self).__init__(('', port), handler)
self.delete_kv_lock = threading.Lock()
self.delete_kv = {}
self.kv_lock = threading.Lock()
self.kv = {}
def get_deleted_size(self, key):
"""
Get the number of deleted entries recorded for key.
"""
ret = 0
with self.delete_kv_lock:
ret = len(self.delete_kv.get(key, []))
return ret
class KVServer:
"""
a server that stores kv pairs; it has an HTTP server inside.
"""
def __init__(self, port, size=None):
"""Init."""
self.http_server = KVHTTPServer(port, KVHandler)
self.listen_thread = None
self.size = size if size else {}
def start(self):
"""
Start the server; it runs until the user calls stop().
"""
self.listen_thread = threading.Thread(
target=lambda: self.http_server.serve_forever())
self.listen_thread.start()
def stop(self):
"""
Stop the server and release its resources.
"""
self.http_server.shutdown()
self.listen_thread.join()
self.http_server.server_close()
def shoud_stop(self):
"""
return whether the server should stop.
Returns:
ret(bool): whether the server should stop
"""
for key in self.size:
s = self.http_server.get_deleted_size(key)
if s != self.size.get(key, 0):
return False
return True
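# A hedged sketch of running the KV server and talking to it over HTTP.
# Keys live under /<scope>/<key>: PUT stores, GET fetches, DELETE records the
# key in delete_kv (consumed by shoud_stop). Port 8090 is an assumption.
from six.moves.urllib import request

kv_server = KVServer(8090)
kv_server.start()

put_req = request.Request("http://127.0.0.1:8090/job/rank0", data=b"ready")
put_req.get_method = lambda: "PUT"
request.urlopen(put_req)

print(request.urlopen("http://127.0.0.1:8090/job/rank0").read())  # b'ready'
kv_server.stop()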
......@@ -16,12 +16,13 @@ from __future__ import print_function
import os
import collections
from ..framework import Variable, default_main_program, in_dygraph_mode, dygraph_only, Parameter, ParamBase
from ..framework import Variable, default_main_program, in_dygraph_mode, dygraph_only, Parameter, ParamBase, _varbase_creator, _dygraph_tracer
import pickle
import six
from . import learning_rate_scheduler
import warnings
from .. import core
from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME, _load_persistable_vars
__all__ = [
'save_dygraph',
......@@ -140,19 +141,80 @@ def load_dygraph(model_path, keep_name_table=False):
elif model_prefix.endswith(".pdopt"):
model_prefix = model_prefix[:-6]
para_dict = None
opti_dict = None
params_file_path = model_prefix + ".pdparams"
if not os.path.exists(params_file_path):
raise RuntimeError("Parameter file [ {} ] not exists".format(
params_file_path))
opti_file_path = model_prefix + ".pdopt"
if not os.path.exists(params_file_path) and not os.path.exists(
opti_file_path):
# Load state dict by `jit.save` save format
# TODO(chenweihang): [Why not support `io.save_inference_model` save format here]
# The model saved by `save_inference_model` does not completely correspond to
# the information required by the `state_dict` under the dygraph.
# Although we reluctantly restore the `state_dict` in some scenarios,
# this may not be complete and there are some limitations, so this function
# will be considered later. The limitations include:
# 1. `save_inference_model` does not save structured names, so we need to
# remind the user to configure the `use_structured_name` argument when
# calling `set_dict`, but this argument is currently not public;
# 2. if `save_inference_model` saved all persistable variables in a single
# file, the user needs to give the variable name list to load `state_dict`
# 1. check model path
if not os.path.isdir(model_prefix):
raise ValueError("Model saved directory '%s' is not exists." %
model_prefix)
# 2. load `__variables.info__`
var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME)
if not os.path.exists(var_info_path):
raise RuntimeError(
"No target can be loaded. Now only supports loading `state_dict` from "
"the result saved by `imperative.save` and `imperative.jit.save`."
)
with open(var_info_path, 'rb') as f:
extra_var_info = pickle.load(f)
# 3. load `__variables__`
# TODO(chenweihang): now only supports loading from default save format:
# - all persistable vars saved in one file named `__variables__`
# for other case, we may need to modify the arguments of this API
var_file_path = os.path.join(model_prefix, VARIABLE_FILENAME)
if not os.path.exists(var_file_path):
raise RuntimeError(
"The parameter file to be loaded was not found. "
"Now only supports loading from the default save format, "
"and does not support custom params_filename and "
"save parameters separately.")
# 4. load all persistable vars
load_var_list = []
for name in sorted(extra_var_info):
var = _varbase_creator(name=name, persistable=True)
load_var_list.append(var)
_dygraph_tracer().trace_op(
type='load_combine',
inputs={},
outputs={'Out': load_var_list},
attrs={'file_path': var_file_path})
# 5. construct state_dict
para_dict = dict()
for var in load_var_list:
structured_name = extra_var_info[var.name].get('structured_name',
None)
if structured_name is None:
raise RuntimeError(
"Cannot find saved variable (%s)'s structured name in saved model."
% var.name)
para_dict[structured_name] = var.numpy()
# NOTE: `jit.save` doesn't save optimizer state
else:
# Load state dict by `save_dygraph` save format
if os.path.exists(params_file_path):
with open(params_file_path, 'rb') as f:
para_dict = pickle.load(f) if six.PY2 else pickle.load(
f, encoding='latin1')
if not keep_name_table and "StructuredToParameterName@@" in para_dict:
del para_dict["StructuredToParameterName@@"]
opti_dict = None
opti_file_path = model_prefix + ".pdopt"
if os.path.exists(opti_file_path):
with open(opti_file_path, 'rb') as f:
opti_dict = pickle.load(f) if six.PY2 else pickle.load(
......
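# A hedged usage sketch of the two load paths handled above: the
# "model.pdparams"/"model.pdopt" pair written by `save_dygraph`, and the
# directory format produced by `jit.save`; "model" is a placeholder prefix.
import paddle.fluid as fluid

para_dict, opti_dict = fluid.dygraph.load_dygraph("model")
# opti_dict is None when no optimizer state was saved (e.g. by `jit.save`)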
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import traceback
from paddle.fluid.dygraph.dygraph_to_static.origin_info import Location, OriginInfo, global_origin_info_map
ERROR_DATA = "Error data about original source code information and traceback."
def attach_error_data(error):
"""
Attaches error data about original source code information and traceback to an error.
Args:
error(Exception): A native error.
Returns:
The error with data about original source code information and traceback attached.
"""
e_type, e_value, e_traceback = sys.exc_info()
tb = traceback.extract_tb(e_traceback)[1:]
error_data = ErrorData(e_type, e_value, tb, global_origin_info_map)
setattr(error, ERROR_DATA, error_data)
return error
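# attach_error_data is meant to be called inside an `except` block, since it
# reads sys.exc_info(); a hedged sketch of the intended call site:
try:
    raise TypeError("dtype must be int32, but received float32")
except BaseException as e:
    attach_error_data(e)
    data = getattr(e, ERROR_DATA)   # the ErrorData instance attached above
    # raise  # re-raise with the data attached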
class TraceBackFrame(OriginInfo):
"""
Traceback frame information.
"""
def __init__(self, location, function_name, source_code):
self.location = location
self.function_name = function_name
self.source_code = source_code
class ErrorData(object):
"""
Error data attached to an exception which is raised in un-transformed code.
TODO(liym27): Consider the case that op_callstack when error raised from c++ code
"""
def __init__(self, error_type, error_value, origin_traceback,
origin_info_map):
self.error_type = error_type
self.error_value = error_value
self.origin_traceback = origin_traceback
self.origin_info_map = origin_info_map
def create_exception(self):
message = self.create_message()
new_exception = self.error_type(message)
setattr(new_exception, ERROR_DATA, self)
return new_exception
def create_message(self):
"""
Creates a custom error message which includes the trace stack with source code information of the user's dygraph code.
"""
message_lines = []
# Step1: Adds header message to prompt users that the following is the original information.
header_message = "In user code:"
message_lines.append(header_message)
message_lines.append("")
# Step2: Optimizes stack information with source code information of dygraph from user.
for filepath, lineno, funcname, code in self.origin_traceback:
loc = Location(filepath, lineno)
dygraph_func_info = self.origin_info_map.get(loc.line_location,
None)
if dygraph_func_info:
# TODO(liym27): more information to prompt users that this is the original information.
# Replaces trace stack information about transformed static code with original dygraph code.
traceback_frame = self.origin_info_map[loc.line_location]
else:
traceback_frame = TraceBackFrame(loc, funcname, code)
message_lines.append(traceback_frame.formated_message())
# Step3: Adds error message like "TypeError: dtype must be int32, but received float32".
error_message = " " * 4 + traceback.format_exception_only(
self.error_type, self.error_value)[0].strip("\n")
message_lines.append(error_message)
return '\n'.join(message_lines)
......@@ -39,32 +39,21 @@ GENERATE_VARIABLE_PREFIX = 'generate_variable'
def create_while_node(condition_name, body_name, loop_var_names):
while_args = []
while_args.append(
gast.Name(
id=condition_name,
ctx=gast.Param(),
annotation=None,
type_comment=None))
while_args.append(
gast.Name(
id=body_name, ctx=gast.Param(), annotation=None, type_comment=None))
assign_targets = [
gast.Name(
id=var_name, ctx=gast.Param(), annotation=None, type_comment=None)
for var_name in loop_var_names
]
while_args.append(gast.List(elts=assign_targets, ctx=gast.Param()))
while_func_id = gast.parse(
'fluid.dygraph.dygraph_to_static.convert_operators.convert_while_loop'
).body[0].value
while_node = gast.Call(func=while_func_id, args=while_args, keywords=[])
assign_node = gast.Assign(
targets=[gast.Tuple(
elts=assign_targets, ctx=gast.Store())],
value=while_node)
return assign_node
# NOTE(liym27):
# It's better to parse the source code into an AST node than to customize an AST node
# including child nodes, because it is easy to mistake the ast node type when customizing the node.
#
# For example: loop_var_names = [a, b, foo.x]; the type of `a` or `b` is gast.Name,
# but the type of `foo.x` is gast.Attribute.
while_func_name = "fluid.dygraph.dygraph_to_static.convert_operators.convert_while_loop"
while_node_str = "[{}] = {}({}, {}, [{}])".format(
",".join(loop_var_names), while_func_name, condition_name, body_name,
",".join(loop_var_names))
while_node = gast.parse(while_node_str).body[0]
return while_node
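# For example, with condition_name="cond", body_name="body" and
# loop_var_names=["i", "foo.x"], the statement built and parsed above is
#
#   [i,foo.x] = fluid.dygraph.dygraph_to_static.convert_operators.convert_while_loop(cond, body, [i,foo.x])
#
# and gast.parse produces the correct Name/Attribute nodes for the targets.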
class NameVisitor(gast.NodeVisitor):
......
......@@ -21,6 +21,7 @@ import gast
# NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node.
ORIGI_INFO = "Original information of source code for ast node."
ORIGI_INFO_MAP = "Original information map of source code."
class Location(object):
......@@ -64,6 +65,11 @@ class OriginInfo(object):
return "{} \nsource_code: {} in function {}\n ".format(
self.location, self.source_code, self.function_name)
def formated_message(self):
return ' File "{}", line {}, in {}\n\t{}'.format(
self.location.filepath, self.location.lineno, self.function_name,
self.source_code.lstrip())
class OriginInfoAttacher(gast.NodeTransformer):
"""
......@@ -119,7 +125,12 @@ class OriginInfoAttacher(gast.NodeTransformer):
return self.col_offset + node.col_offset
def create_origin_info_map(transformed_node, static_func):
global_origin_info_map = {}
def create_and_update_origin_info_map(transformed_node,
static_func,
is_global=True):
"""
Creates an origin information map between the transformed static function and the original dygraph function.
......@@ -156,6 +167,10 @@ def create_origin_info_map(transformed_node, static_func):
origin_info_map[static_loc] = dygraph_info
global_origin_info_map.update(origin_info_map)
if is_global:
return global_origin_info_map
return origin_info_map
......
......@@ -47,8 +47,7 @@ class PrintTransformer(gast.NodeTransformer):
# NOTE: deal with print in PY3
def visit_Call(self, node):
if isinstance(node.func, gast.Name) and node.func.id == 'print':
convert_print_node = self._create_print_node(node.args)
return gast.Expr(value=convert_print_node)
node = self._create_print_node(node.args)
return node
# NOTE: deal with print in PY2
......
......@@ -36,6 +36,8 @@ from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
from paddle.fluid.dygraph.base import param_guard
from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from
from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info, create_and_update_origin_info_map
from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data, ERROR_DATA
__all__ = ['ProgramTranslator', 'convert_to_static']
......@@ -88,15 +90,23 @@ class FunctionCache(object):
# with decorator directly and function.__wrapped__ holds the actual function.
func = getattr(func, '__wrapped__', func)
source_code = func_to_source_code(func)
# TODO(liym27):
# Consider this case: source_code in self._code_to_ast_caches,
# but actually they are methods in different classes.
# Maybe use (__class__, source_code) as key
if source_code in self._code_to_ast_caches:
root_wrapper = self._code_to_ast_caches[source_code]
else:
root = gast.parse(source_code)
root = attach_origin_info(root, func)
root_wrapper = self._dygraph_to_static.get_static_ast(root)
self._code_to_ast_caches[source_code] = root_wrapper
# Get static function from AST
static_func, file_name = ast_to_func(root_wrapper.node, func)
create_and_update_origin_info_map(root_wrapper.node, static_func)
return static_func
def exist(self, func):
......@@ -125,6 +135,7 @@ class FunctionSpec(object):
self._args = args
self._kwargs = kwargs
# TODO(liym27): func has multi layer decorator
dyfunc = getattr(func, '__wrapped__', func)
self._dyfunc_code = inspect.getsource(dyfunc)
......@@ -282,7 +293,13 @@ class ConcreteProgram(object):
# 3. Builds program only once and returns the output Variables.
with param_guard(func_spec.parameters(False)), param_guard(
func_spec.buffers(False)):
try:
outputs = static_func(*inputs)
except BaseException as e:
# NOTE: If e is raised in compile time, e should be attached to ERROR_DATA here.
attach_error_data(e)
raise
if not isinstance(outputs,
(tuple, list)) and outputs is not None:
outputs = [outputs]
......@@ -483,15 +500,25 @@ class ProgramTranslator(object):
return dygraph_func(*args, **kwargs)
function_spec = FunctionSpec(dygraph_func, args, kwargs)
_, partial_program_layer = self._program_cache[function_spec]
concrete_program, partial_program_layer = self._program_cache[
function_spec]
if args and isinstance(args[0], layers.Layer):
# Synchronize self.training attribute.
partial_program_layer.training = args[0].training
args = args[1:]
try:
return partial_program_layer(args)
except BaseException as e:
# NOTE:
# 1. If e is raised in compile time, e should have been attached to ERROR_DATA before;
# 2. If e raised in runtime, e should be attached to ERROR_DATA here.
if not hasattr(e, ERROR_DATA):
# runtime error
attach_error_data(e)
raise
def get_func(self, dygraph_func):
"""
Returns a callable function which converts imperative dygraph APIs of
......
......@@ -425,8 +425,7 @@ def _load_persistable_vars(model_path,
params_filename=None):
# 1. load extra var info
with open(var_info_path, 'rb') as f:
extra_var_info = pickle.load(f) if six.PY2 else pickle.load(
f, encoding='latin1')
extra_var_info = pickle.load(f)
# 2. construct var dict
load_var_dict = dict()
......
......@@ -15,20 +15,23 @@
from __future__ import print_function
import os
import six
import pickle
import warnings
import six
from paddle.fluid import core
from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph
from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, FunctionSpec
from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA
from paddle.fluid.dygraph.dygraph_to_static.program_translator import FunctionSpec, ProgramTranslator
from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer
from paddle.fluid.dygraph.layers import Layer
from paddle.fluid.executor import Executor, scope_guard
from paddle.fluid.framework import Program, Block, Variable, ParamBase, _dygraph_tracer, dygraph_only, _dygraph_guard, _current_expected_place, in_dygraph_mode
from paddle.fluid.framework import Block, ParamBase, Program, Variable
from paddle.fluid.framework import _current_expected_place, _dygraph_guard, _dygraph_tracer
from paddle.fluid.framework import dygraph_only, in_dygraph_mode
from paddle.fluid.wrapped_decorator import wrap_decorator
from paddle.fluid.dygraph.io import TranslatedLayer, VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME
__all__ = ['TracedLayer', 'declarative', 'dygraph_to_static_func']
......@@ -167,7 +170,15 @@ def _declarative_(dygraph_func):
"The decorator 'declarative' doesn't work when setting ProgramTranslator.enable=False. "
"We will just return dygraph output.")
return dygraph_func(*args, **kwargs)
try:
return program_translator.get_output(dygraph_func, *args, **kwargs)
except Exception as e:
error_data = getattr(e, ERROR_DATA, None)
if error_data:
new_exception = error_data.create_exception()
raise new_exception
else:
raise
return __impl__
......
......@@ -35,7 +35,7 @@ __all__ = [
'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding',
'GRUUnit', 'InstanceNorm', 'LayerNorm', 'NCE', 'PRelu',
'BilinearTensorProduct', 'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm',
'SpectralNorm', 'TreeConv'
'SpectralNorm', 'TreeConv', 'Flatten'
]
......@@ -3182,3 +3182,62 @@ class TreeConv(layers.Layer):
else:
pre_activation = out
return self._helper.append_activation(pre_activation, act=self._act)
class Flatten(layers.Layer):
"""
:alias_main: paddle.nn.Flatten
:alias: paddle.nn.Flatten,paddle.nn.layer.Flatten,paddle.nn.layer.common.Flatten
This interface is used to construct a callable object of the ``Flatten`` class.
For more details, refer to code examples.
It flattens a contiguous range of dims in a tensor.
Parameters:
start_axis(int): first dim to flatten (default = 1)
stop_axis(int): last dim to flatten (default = -1).
Returns:
None
Examples:
.. code-block:: python
import paddle
from paddle.imperative import to_variable
import numpy as np
inp_np = np.ones([5, 2, 3, 4]).astype('float32')
paddle.enable_imperative()
inp_np = to_variable(inp_np)
flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2)
flatten_res = flatten(inp_np)
"""
def __init__(self, start_axis=1, stop_axis=-1):
super(Flatten, self).__init__()
self.start_axis = start_axis
self.stop_axis = stop_axis
def forward(self, input):
out = self._helper.create_variable_for_type_inference(input.dtype)
x_shape = self._helper.create_variable_for_type_inference(input.dtype)
if in_dygraph_mode():
dy_out, _ = core.ops.flatten_contiguous_range(
input, 'start_axis', self.start_axis, 'stop_axis',
self.stop_axis)
return dy_out
self._helper.append_op(
type="flatten_contiguous_range",
inputs={"X": input},
outputs={"Out": out,
"XShape": x_shape},
attrs={"start_axis": self.start_axis,
"stop_axis": self.stop_axis})
return out
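# Worked shape example for the docstring above: with input shape [5, 2, 3, 4]
# and start_axis=1, stop_axis=2, axes 1..2 are merged (2 * 3 = 6), so
# flatten(inp_np).shape == [5, 6, 4]; stated here as an illustration of the
# flatten_contiguous_range semantics.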
......@@ -21,7 +21,7 @@ from paddle.fluid.executor import Executor
from paddle.fluid.optimizer import SGD
from paddle.fluid.incubate.fleet.base.mode import Mode
from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase
from paddle.fleet.base.role_maker import RoleMakerBase
from paddle.fluid.contrib.mixed_precision.decorator import OptimizerWithMixedPrecision
from . import mode
......@@ -209,7 +209,10 @@ class Fleet(object):
self._executor = Executor(fluid.CPUPlace())
if role_maker and not isinstance(role_maker, RoleMakerBase):
raise TypeError("role_maker must be an instance of RoleMakerBase")
from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase as RoleMakerBaseIncubate
if role_maker and not isinstance(role_maker, RoleMakerBaseIncubate):
raise TypeError(
"role_maker must be an instance of RoleMakerBase")
self._role_maker = role_maker
self._role_maker.generate_role()
......
......@@ -9911,7 +9911,7 @@ def flatten(x, axis=1, name=None):
return out
def stack(x, axis=0):
def stack(x, axis=0, name=None):
"""
This OP stacks all the inputs :code:`x` along axis.
......@@ -9991,15 +9991,16 @@ def stack(x, axis=0):
data = layers.stack(x1) # stack according to axis 0, data.shape=[1, None, 1, 2]
"""
helper = LayerHelper('stack', **locals())
axis = 0 if axis is None else axis
if not isinstance(x, list) and not isinstance(x, tuple):
x = [x]
if in_dygraph_mode():
return core.ops.stack(x, 'axis', axis)
helper = LayerHelper('stack', **locals())
out = helper.create_variable_for_type_inference(x[0].dtype)
if not in_dygraph_mode() and \
x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \
"number of the elements must be 1, but received %s." % len(x)
out_index = helper.create_variable_for_type_inference(dtype="int32")
......
......@@ -28,7 +28,6 @@ from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_bu
from .unique_name import UniqueNameGenerator
import logging
import warnings
from .dataset import DatasetBase, InMemoryDataset
### Dygraph DataLoader configs ###
import os
......@@ -1670,7 +1669,7 @@ class PyReader(DataLoaderBase):
class DatasetLoader(DataLoaderBase):
def __init__(self, dataset, places, drop_last):
assert isinstance(dataset,
assert isinstance(dataset, paddle.fleet.dataset.
DatasetBase), "dataset must be type of DatasetBase"
assert not in_dygraph_mode(
), "DatasetLoader is not supported in dygraph mode yet"
......@@ -1686,7 +1685,7 @@ class DatasetLoader(DataLoaderBase):
dataset.set_thread(thread_num)
if isinstance(dataset,
if isinstance(dataset, paddle.fleet.dataset.
InMemoryDataset) and dataset.queue_num > thread_num:
logging.warn("queue_num {} which is set in Dataset is ignored".
format(dataset.queue_num))
......
......@@ -345,7 +345,6 @@ if(WITH_DISTRIBUTE)
# FIXME(typhoonzero): add these tests back
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer")
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler")
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr")
#not need
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_base")
......
......@@ -28,6 +28,7 @@ import numpy as np
import ctr_dataset_reader
from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
from paddle.fleet.base.util_factory import fleet_util
# Fix seed for test
fluid.default_startup_program().random_seed = 1
......@@ -181,8 +182,14 @@ class TestDistCTR2x2(FleetDistRunnerBase):
loss_val = exe.run(program=compiled_prog,
fetch_list=[self.avg_cost.name])
loss_val = np.mean(loss_val)
print("TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
loss_val))
reduce_output = fleet_util.all_reduce(
np.array(loss_val), mode="sum")
loss_all_trainer = fleet_util.all_gather(float(loss_val))
loss_val = float(reduce_output) / len(loss_all_trainer)
message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
loss_val)
fleet_util.print_on_rank(message, 0)
pass_time = time.time() - pass_start
except fluid.core.EOFException:
self.reader.reset()
......@@ -210,7 +217,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
filelist.append(train_file_path)
# config dataset
dataset = fluid.DatasetFactory().create_dataset()
dataset = paddle.fleet.DatasetFactory().create_dataset()
dataset.set_batch_size(batch_size)
dataset.set_use_var(self.feeds)
pipe_command = 'python ctr_dataset_reader.py'
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import inspect
import unittest
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.core import EnforceNotMet
from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA, ErrorData
from paddle.fluid.dygraph.dygraph_to_static.origin_info import unwrap
from paddle.fluid.dygraph.jit import declarative
def inner_func():
fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")
return
@declarative
def func_error_in_compile_time(x):
x = fluid.dygraph.to_variable(x)
inner_func()
if fluid.layers.mean(x) < 0:
x_v = x - 1
else:
x_v = x + 1
return x_v
@declarative
def func_error_in_compile_time_2(x):
x = fluid.dygraph.to_variable(x)
x = fluid.layers.reshape(x, shape=[1, 2])
return x
@declarative
def func_error_in_runtime(x, iter_num=3):
x = fluid.dygraph.to_variable(x)
a = []
iter_num = fluid.layers.fill_constant(
shape=[1], value=iter_num, dtype="int32")
for i in range(iter_num):
a.append(b)
a = fluid.layers.concat(a, axis=0)
return a
class TestErrorInCompileTime(unittest.TestCase):
def setUp(self):
self.set_func()
self.set_input()
self.set_exception_type()
def set_func(self):
self.func = func_error_in_compile_time
def set_exception_type(self):
self.exception_type = TypeError
def set_input(self):
self.input = np.ones([3, 2])
def set_message(self):
self.expected_message = \
['File "{}", line 36, in func_error_in_compile_time'.format(self.filepath),
'inner_func()',
'File "{}", line 29, in inner_func'.format(self.filepath),
'fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")',
]
def _test_create_message(self, error_data):
self.filepath = inspect.getfile(unwrap(self.func))
self.set_message()
error_message = error_data.create_message()
self.assertIn('In user code:', error_message)
for m in self.expected_message:
self.assertIn(m, error_message)
def test(self):
with fluid.dygraph.guard():
with self.assertRaises(self.exception_type) as cm:
self.func(self.input)
exception = cm.exception
error_data = getattr(exception, ERROR_DATA)
self.assertIsInstance(error_data, ErrorData)
self._test_create_message(error_data)
class TestErrorInCompileTime2(TestErrorInCompileTime):
def set_func(self):
self.func = func_error_in_compile_time_2
def set_exception_type(self):
self.exception_type = EnforceNotMet
def set_message(self):
self.expected_message = \
[
'File "{}", line 47, in func_error_in_compile_time_2'.format(self.filepath),
'x = fluid.layers.reshape(x, shape=[1, 2])'
]
# TODO(liym27): Consider the case that op_callstack when error raised from c++ code
class TestErrorInRuntime(TestErrorInCompileTime):
def set_func(self):
self.func = func_error_in_runtime
def set_exception_type(self):
self.exception_type = EnforceNotMet
def test(self):
with fluid.dygraph.guard():
with self.assertRaises(self.exception_type) as cm:
self.func(self.input)
if __name__ == '__main__':
unittest.main()
......@@ -90,7 +90,8 @@ class TestOriginInfo(unittest.TestCase):
# step3
self.static_func, _ = ast_to_func(transformed_ast, self.dygraph_func)
info_map = create_origin_info_map(dygraph_ast, self.static_func)
info_map = create_and_update_origin_info_map(dygraph_ast,
self.static_func)
return info_map
......
......@@ -17,7 +17,7 @@ import sys
import time
def train():
def train(prefix):
selected_gpus = os.getenv("FLAGS_selected_gpus")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
......@@ -29,11 +29,12 @@ def train():
.format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
print(name)
with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
"w") as f:
f.write(name)
def train_abort():
def train_abort(prefix):
selected_gpus = os.getenv("FLAGS_selected_gpus")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
......@@ -49,7 +50,8 @@ def train_abort():
name = "abort>>> selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
.format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
print(name)
with open("multi_process.check_{}.log".format(trainer_id),
with open(
"multi_process_{}.check_{}.log".format(prefix, trainer_id),
"w") as f:
f.write(name)
raise
......@@ -60,12 +62,15 @@ def train_abort():
.format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
print(name)
with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
"w") as f:
f.write(name)
if __name__ == '__main__':
if len(sys.argv) == 2 and sys.argv[1] == "abort":
train_abort()
if len(sys.argv) == 3 and sys.argv[2] == "abort":
prefix = sys.argv[1]
train_abort(prefix)
else:
train()
prefix = sys.argv[1]
train(prefix)
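# Hedged invocation sketch for the launch tests assumed to drive this script:
#   python multi_process.py <prefix>        -> multi_process_<prefix>.check_<id>.log
#   python multi_process.py <prefix> abort  -> same log via the abort path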
......@@ -63,18 +63,104 @@ class TestAddMMOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# The input type of addmm_op must be Variable.
input = fluid.create_lod_tensor(
np.array([[-1]]), [[1]], fluid.CPUPlace())
np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
x1 = fluid.create_lod_tensor(
np.array([[-1]]), [[1]], fluid.CPUPlace())
np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
x2 = fluid.create_lod_tensor(
np.array([[-1]]), [[1]], fluid.CPUPlace())
np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
self.assertRaises(TypeError, paddle.addmm, input, x1, x2)
# The input dtype of mul_op must be float32 or float64.
input = fluid.layers.data(name='input', shape=[4], dtype="int32")
x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32")
x4 = fluid.layers.data(name='x4', shape=[4], dtype="int32")
input = fluid.layers.data(
name='input',
shape=[4, 4],
dtype="int32",
append_batch_size=False)
x3 = fluid.layers.data(
name='x3', shape=[4, 4], dtype="int32", append_batch_size=False)
x4 = fluid.layers.data(
name='x4', shape=[4, 4], dtype="int32", append_batch_size=False)
self.assertRaises(TypeError, paddle.addmm, input, x3, x4)
# x and y dimension mismatch
x5 = fluid.layers.data(
name='x5',
shape=[4, 5],
dtype="float32",
append_batch_size=False)
x6 = fluid.layers.data(
name='x6',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input, x5, x6)
# input and x are not broadcastable
x7 = fluid.layers.data(
name='x7',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
x8 = fluid.layers.data(
name='x8',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
input1 = fluid.layers.data(
name='input1',
shape=[2, 4],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input1, x7, x8)
# input and x are not broadcastable
x9 = fluid.layers.data(
name='x9',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
x10 = fluid.layers.data(
name='x10',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
input2 = fluid.layers.data(
name='input2',
shape=[1, 2],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input2, x9, x10)
x11 = fluid.layers.data(
name='x11',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
x12 = fluid.layers.data(
name='x12',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
input3 = fluid.layers.data(
name='input3',
shape=[4, 2],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input3, x11, x12)
x13 = fluid.layers.data(
name='x13',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
x14 = fluid.layers.data(
name='x14',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
input4 = fluid.layers.data(
name='input4',
shape=[3, 1],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input4, x13, x14)
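# Illustrative sketch (not part of this test file): the shape rules the
# cases above exercise. check_addmm_shapes is a hypothetical helper that
# mirrors the validation paddle.addmm is expected to perform: x @ y needs
# x.shape[1] == y.shape[0], and input must broadcast to (x.shape[0], y.shape[1]).
def check_addmm_shapes(input_shape, x_shape, y_shape):
    if x_shape[1] != y_shape[0]:
        raise ValueError("x and y dimension mismatch")
    out_shape = (x_shape[0], y_shape[1])
    for i_dim, o_dim in zip(input_shape[::-1], out_shape[::-1]):
        if i_dim not in (1, o_dim):
            raise ValueError("input and x are not broadcastable")
# e.g. check_addmm_shapes((2, 4), (4, 4), (4, 4)) raises, matching input1/x7/x8 above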
class TestAddMMOp2(TestAddMMOp):
......@@ -147,5 +233,23 @@ class TestAddMMOp4(unittest.TestCase):
assert np.allclose(np_input + np.dot(np_x, np_y), out.numpy())
'''
class TestAddMMAPI(unittest.TestCase):
def test_api_error(self):
data_x = np.ones((2, 2)).astype(np.float32)
data_y = np.ones((2, 2)).astype(np.float32)
data_input = np.ones((2, 2)).astype(np.float32)
paddle.enable_imperative()
def test_error1():
data_x_wrong = np.ones((2, 3)).astype(np.float32)
x = paddle.imperative.to_variable(data_x_wrong)
y = paddle.imperative.to_variable(data_y)
input = paddle.imperative.to_variable(data_input)
out = paddle.tensor.addmm(input=input, x=x, y=y, beta=0.5, alpha=5.0)
self.assertRaises(ValueError, test_error1)
'''
if __name__ == "__main__":
unittest.main()
......@@ -73,5 +73,15 @@ class API_TestDygraphBmm(unittest.TestCase):
self.assertTrue(np.allclose(expected_result, out_np))
class TestBmmAPIError(unittest.TestCase):
def test_api_error(self):
x_data = np.arange(24, dtype='float32').reshape((2, 3, 4))
y_data = np.arange(16, dtype='float32').reshape((2, 4, 2))
y_data_wrong1 = np.arange(16, dtype='float32').reshape((2, 2, 4))
y_data_wrong2 = np.arange(16, dtype='float32').reshape((2, 2, 2, 2))
self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong1)
self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong2)
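# Illustrative sketch (not part of this test file): bmm requires two 3-D
# inputs with x.shape[2] == y.shape[1]. y_data_wrong1 breaks the inner-dim
# rule (4 vs 2) and y_data_wrong2 breaks the rank rule (it is 4-D). A NumPy
# reference for the valid pairing:
import numpy as np
x_ref = np.arange(24, dtype='float32').reshape((2, 3, 4))
y_ref = np.arange(16, dtype='float32').reshape((2, 4, 2))
out_ref = np.matmul(x_ref, y_ref)  # reference result, shape (2, 3, 2)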
if __name__ == "__main__":
unittest.main()
......@@ -17,6 +17,7 @@ including create, config, run, etc.
"""
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.compat as cpt
import paddle.fluid.core as core
......@@ -37,23 +38,26 @@ class TestDataset(unittest.TestCase):
def test_dataset_create(self):
""" Testcase for dataset create. """
try:
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
except:
self.assertTrue(False)
try:
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"QueueDataset")
except:
self.assertTrue(False)
try:
dataset = fluid.DatasetFactory().create_dataset(
dataset = paddle.fleet.DatasetFactory().create_dataset(
"FileInstantDataset")
except:
self.assertTrue(False)
try:
dataset = fluid.DatasetFactory().create_dataset("MyOwnDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"MyOwnDataset")
self.assertTrue(False)
except:
self.assertTrue(True)
......@@ -91,7 +95,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist(
......@@ -125,7 +130,7 @@ class TestDataset(unittest.TestCase):
dataset.set_trainer_num(4)
dataset.set_hdfs_config("my_fs_name", "my_fs_ugi")
dataset.set_download_cmd("./read_from_afs my_fs_name my_fs_ugi")
dataset.enable_pv_merge()
dataset.set_enable_pv_merge(False)
thread_num = dataset.get_thread_num()
self.assertEqual(thread_num, 12)
......@@ -171,7 +176,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([filename1, filename2])
......@@ -222,7 +228,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......@@ -293,7 +300,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="float32", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(1)
dataset.set_parse_ins_id(True)
......@@ -359,7 +367,8 @@ class TestDataset(unittest.TestCase):
name="slot4", shape=[1], dtype="float32", lod_level=0)
slots_vars = [var1, var2, var3, var4]
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(1)
dataset.set_parse_ins_id(True)
......@@ -414,7 +423,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="float32", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......@@ -507,7 +517,7 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist(
......@@ -532,7 +542,7 @@ class TestDataset(unittest.TestCase):
except Exception as e:
self.assertTrue(False)
dataset2 = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset2 = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset2.set_use_var(slots_vars)
dataset2.set_batch_size(32)
dataset2.set_thread(3)
......@@ -573,7 +583,7 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="float32", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist(
......@@ -628,7 +638,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_input_type(1)
dataset.set_batch_size(1)
dataset.set_thread(2)
......@@ -707,7 +718,7 @@ class TestDatasetWithFetchHandler(unittest.TestCase):
inputs(list): inputs of get_dataset
files(list): files of get_dataset
"""
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist(files)
......@@ -864,7 +875,8 @@ class TestDataset2(unittest.TestCase):
except ImportError as e:
print("warning: no mpi4py")
exe.run(startup_program)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......@@ -884,9 +896,6 @@ class TestDataset2(unittest.TestCase):
"""
Testcase for InMemoryDataset from create to run.
"""
self.skipTest("parameter server will add pslib UT later")
with open("test_in_memory_dataset2_run2_a.txt", "w") as f:
data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
......@@ -902,7 +911,7 @@ class TestDataset2(unittest.TestCase):
train_program = fluid.Program()
startup_program = fluid.Program()
scope = fluid.Scope()
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
with fluid.program_guard(train_program, startup_program):
slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"]
slots_vars = []
......@@ -936,7 +945,8 @@ class TestDataset2(unittest.TestCase):
except ImportError as e:
print("warning: no mpi4py")
exe.run(startup_program)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......@@ -952,6 +962,63 @@ class TestDataset2(unittest.TestCase):
print("warning: catch expected error")
fleet._opt_info = None
fleet._fleet_ptr = None
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_rank_offset("")
dataset.set_pv_batch_size(1)
dataset.set_hdfs_config("", "")
d = paddle.fleet.DatasetBase()
try:
dataset.set_feed_type("MultiSlotInMemoryDataFeed")
except:
print("warning: catch expected error")
dataset.thread_num = 0
try:
dataset._prepare_to_run()
except:
print("warning: catch expected error")
dataset.set_parse_logkey(True)
dataset.set_merge_by_sid(True)
dataset.set_enable_pv_merge(True)
try:
dataset.preprocess_instance()
except:
print("warning: catch expected error")
try:
dataset.set_current_phase(1)
except:
print("warning: catch expected error")
try:
dataset.postprocess_instance()
except:
print("warning: catch expected error")
dataset.set_fleet_send_batch_size(1024)
try:
dataset.global_shuffle()
except:
print("warning: catch expected error")
dataset.get_pv_data_size()
dataset.get_memory_data_size()
dataset.get_shuffle_data_size()
dataset = paddle.fleet.DatasetFactory().create_dataset(
"QueueDataset")
try:
dataset.local_shuffle()
except:
print("warning: catch expected error")
try:
dataset.global_shuffle()
except:
print("warning: catch expected error")
dataset = paddle.fleet.FileInstantDataset()
try:
dataset.local_shuffle()
except:
print("warning: catch expected error")
try:
dataset.global_shuffle()
except:
print("warning: catch expected error")
os.remove("./test_in_memory_dataset2_run2_a.txt")
os.remove("./test_in_memory_dataset2_run2_b.txt")
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
import numpy as np
import six
......@@ -96,7 +97,8 @@ class DatasetLoaderTestBase(unittest.TestCase):
def check_batch_number(self, place, randomize_batch_num=False):
main_prog, startup_prog, feeds = self.build_network()
dataset = fluid.DatasetFactory().create_dataset(self.dataset_name)
dataset = paddle.fleet.DatasetFactory().create_dataset(
self.dataset_name)
dataset.set_batch_size(BATCH_SIZE)
if isinstance(place, fluid.CPUPlace):
......
......@@ -21,6 +21,9 @@ import os
import sys
import subprocess
import six
import shutil
import numpy as np
import argparse
from contextlib import closing
import socket
......@@ -29,7 +32,8 @@ import tempfile
import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
import paddle.fleet.base.role_maker as role_maker
from paddle.fleet.base.util_factory import fleet_util
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
......@@ -48,18 +52,26 @@ class FleetDistRunnerBase(object):
"""
def build_role(self, args):
if args.role.upper() == "PSERVER":
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=True,
path=args.gloo_path,
current_id=args.current_id,
role=role_maker.Role.SERVER,
worker_num=args.trainers,
worker_endpoints=args.trainer_endpoints.split(","),
server_endpoints=args.endpoints.split(","))
else:
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=True,
path=args.gloo_path,
current_id=args.current_id,
role=role_maker.Role.WORKER,
worker_num=args.trainers,
worker_endpoints=args.trainer_endpoints.split(","),
server_endpoints=args.endpoints.split(","))
self.role = role
return role
def build_strategy(self, args):
......@@ -114,26 +126,13 @@ class FleetDistRunnerBase(object):
optimizer.minimize(avg_cost)
def run_pserver(self, args):
fleet.init(self.build_role(args))
strategy = self.build_strategy(args)
avg_cost = self.net(args)
self.build_optimizer(avg_cost, strategy)
fleet.init_server()
fleet.run_server()
def run_dataset_trainer(self, args):
fleet.init(self.build_role(args))
strategy = self.build_strategy(args)
avg_cost = self.net(args)
self.build_optimizer(avg_cost, strategy)
out = self.do_dataset_training(fleet)
def run_pyreader_trainer(self, args):
fleet.init(self.build_role(args))
strategy = self.build_strategy(args)
avg_cost = self.net(args)
self.build_optimizer(avg_cost, strategy)
out = self.do_pyreader_training(fleet)
def net(self, args, batch_size=4, lr=0.01):
......@@ -173,10 +172,14 @@ class TestFleetBase(unittest.TestCase):
print("set begin_port:", DIST_UT_PORT)
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
DIST_UT_PORT, DIST_UT_PORT + 1)
DIST_UT_PORT += 2
self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
DIST_UT_PORT + 2, DIST_UT_PORT + 3)
DIST_UT_PORT += 4
else:
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._python_interp = sys.executable
self._geo_sgd_need_push_nums = 5
......@@ -236,18 +239,22 @@ class TestFleetBase(unittest.TestCase):
def _run_cluster(self, model, envs):
env = {'GRAD_CLIP': str(self._grad_clip_mode)}
python_path = self._python_interp
gloo_path = tempfile.mkdtemp()
if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
python_path += " -m coverage run --branch -p"
env.update(envs)
tr_cmd = "{0} {1} --role trainer --endpoints {2} --current_id {{}} --trainers {3} --mode {4} --geo_sgd_need_push_nums {5} --reader {6}".format(
python_path, model, self._ps_endpoints, self._trainers, self._mode,
self._geo_sgd_need_push_nums, self._reader)
tr_cmd = "{0} {1} --role trainer --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8}".format(
python_path, model, self._ps_endpoints, self._tr_endpoints,
self._trainers, self._mode, self._geo_sgd_need_push_nums,
self._reader, gloo_path)
ps_cmd = "{0} {1} --role pserver --endpoints {2} --current_id {{}} --trainers {3} --mode {4} --geo_sgd_need_push_nums {5} --reader {6}".format(
python_path, model, self._ps_endpoints, self._trainers, self._mode,
self._geo_sgd_need_push_nums, self._reader)
ps_cmd = "{0} {1} --role pserver --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8}".format(
python_path, model, self._ps_endpoints, self._tr_endpoints,
self._trainers, self._mode, self._geo_sgd_need_push_nums,
self._reader, gloo_path)
# Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env)
......@@ -284,6 +291,7 @@ class TestFleetBase(unittest.TestCase):
ps0.terminate()
ps1.terminate()
shutil.rmtree(gloo_path)
return 0, 0
def check_with_place(self,
......@@ -313,6 +321,9 @@ def runtime_main(test_class):
parser.add_argument(
'--role', type=str, required=True, choices=['pserver', 'trainer'])
parser.add_argument('--endpoints', type=str, required=False, default="")
parser.add_argument(
'--trainer_endpoints', type=str, required=False, default="")
parser.add_argument('--gloo_path', type=str, required=False, default="")
parser.add_argument('--current_id', type=int, required=False, default=0)
parser.add_argument('--trainers', type=int, required=False, default=1)
parser.add_argument('--mode', type=str, required=False, default='geo')
......@@ -322,6 +333,13 @@ def runtime_main(test_class):
args = parser.parse_args()
model = test_class()
role = model.build_role(args)
fleet.init(role)
strategy = model.build_strategy(args)
avg_cost = model.net(args)
model.build_optimizer(avg_cost, strategy)
fleet_util._set_strategy(strategy)
fleet_util._set_role_maker(role)
if args.role == "pserver":
model.run_pserver(args)
else:
......
......@@ -22,7 +22,7 @@ from test_dist_fleet_base import TestFleetBase
class TestDistMnistSync2x2(TestFleetBase):
def _setup_config(self):
self._mode = "sync"
self._mode = "async"
self._reader = "pyreader"
def check_with_place(self,
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid as fluid
import paddle
from op_test import OpTest
class TestFlattenOp(OpTest):
def setUp(self):
self.op_type = "flatten_contiguous_range"
self.start_axis = 0
self.stop_axis = -1
self.init_test_case()
self.inputs = {"X": np.random.random(self.in_shape).astype("float64")}
self.init_attrs()
self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.in_shape).astype("float32")
}
def test_check_output(self):
self.check_output(no_check_set=["XShape"])
def test_check_grad(self):
self.check_grad(["X"], "Out")
def init_test_case(self):
self.in_shape = (3, 2, 5, 4)
self.start_axis = 0
self.stop_axis = -1
self.new_shape = (120)
def init_attrs(self):
self.attrs = {
"start_axis": self.start_axis,
"stop_axis": self.stop_axis
}
class TestFlattenOp_1(TestFlattenOp):
def init_test_case(self):
self.in_shape = (3, 2, 5, 4)
self.start_axis = 1
self.stop_axis = 2
self.new_shape = (3, 10, 4)
def init_attrs(self):
self.attrs = {
"start_axis": self.start_axis,
"stop_axis": self.stop_axis
}
class TestFlattenOp_2(TestFlattenOp):
def init_test_case(self):
self.in_shape = (3, 2, 5, 4)
self.start_axis = 0
self.stop_axis = 1
self.new_shape = (6, 5, 4)
def init_attrs(self):
self.attrs = {
"start_axis": self.start_axis,
"stop_axis": self.stop_axis
}
class TestFlattenOp_3(TestFlattenOp):
def init_test_case(self):
self.in_shape = (3, 2, 5, 4)
self.start_axis = 0
self.stop_axis = 2
self.new_shape = (30, 4)
def init_attrs(self):
self.attrs = {
"start_axis": self.start_axis,
"stop_axis": self.stop_axis
}
class TestFlattenOp_4(TestFlattenOp):
def init_test_case(self):
self.in_shape = (3, 2, 5, 4)
self.start_axis = -2
self.stop_axis = -1
self.new_shape = (3, 2, 20)
def init_attrs(self):
self.attrs = {
"start_axis": self.start_axis,
"stop_axis": self.stop_axis
}
class TestFlattenOp_5(TestFlattenOp):
def init_test_case(self):
self.in_shape = (3, 2, 5, 4)
self.start_axis = 2
self.stop_axis = 2
self.new_shape = (3, 2, 5, 4)
def init_attrs(self):
self.attrs = {
"start_axis": self.start_axis,
"stop_axis": self.stop_axis
}
class TestFlattenOpSixDims(TestFlattenOp):
def init_test_case(self):
self.in_shape = (3, 2, 3, 2, 4, 4)
self.start_axis = 3
self.stop_axis = 5
self.new_shape = (3, 2, 3, 32)
def init_attrs(self):
self.attrs = {
"start_axis": self.start_axis,
"stop_axis": self.stop_axis
}
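# Illustrative sketch (not part of this test file): how the expected
# new_shape values above can be derived. flattened_shape is a hypothetical
# helper that merges the extents from start_axis through stop_axis
# (negative axes count from the end); e.g. (3, 2, 5, 4) with start_axis=1,
# stop_axis=2 gives (3, 10, 4).
import functools
import operator
def flattened_shape(shape, start_axis, stop_axis):
    n = len(shape)
    start = start_axis + n if start_axis < 0 else start_axis
    stop = stop_axis + n if stop_axis < 0 else stop_axis
    merged = functools.reduce(operator.mul, shape[start:stop + 1], 1)
    return shape[:start] + (merged, ) + shape[stop + 1:]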
class TestFlatten2OpError(unittest.TestCase):
def test_errors(self):
image_shape = (2, 3, 4, 4)
x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
image_shape[3]).reshape(image_shape) / 100.
x = x.astype('float32')
def test_ValueError1():
x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32')
out = paddle.flatten(x_var, start_axis=2, stop_axis=1)
self.assertRaises(ValueError, test_ValueError1)
def test_ValueError2():
x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32')
paddle.flatten(x_var, start_axis=10, stop_axis=1)
self.assertRaises(ValueError, test_ValueError2)
def test_ValueError3():
x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32')
paddle.flatten(x_var, start_axis=2, stop_axis=10)
self.assertRaises(ValueError, test_ValueError3)
def test_type():
# dtype must be float32, float64, int8, int32, int64.
x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
image_shape[3]).reshape(image_shape) / 100.
x2 = x2.astype('float16')
x2_var = paddle.data(name='x2', shape=[3, 2, 4, 5], dtype='float16')
paddle.flatten(x2_var)
self.assertRaises(TypeError, test_type)
def test_InputError():
out = paddle.flatten(x)
self.assertRaises(ValueError, test_InputError)
class TestFlattenPython(unittest.TestCase):
def test_python_api(self):
image_shape = (2, 3, 4, 4)
x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
image_shape[3]).reshape(image_shape) / 100.
x = x.astype('float32')
def test_InputError():
out = paddle.flatten(x)
self.assertRaises(ValueError, test_InputError)
def test_Negative():
paddle.enable_imperative()
img = paddle.imperative.to_variable(x)
out = paddle.flatten(img, start_axis=-2, stop_axis=-1)
return out.numpy().shape
res_shape = test_Negative()
self.assertTrue((2, 3, 16) == res_shape)
if __name__ == "__main__":
unittest.main()
......@@ -4,7 +4,6 @@ set -e
function test_launch_ps(){
fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "test pserver launch succeed"
else
......@@ -20,7 +19,7 @@ fi
test_launch_ps
# use default values
fleetrun multi_process.py
fleetrun multi_process.py fleetrun
# use paddlecloud
echo "begin test use paddlecloud"
......@@ -30,16 +29,16 @@ export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35019
export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2
distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun
str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
file_0="multi_process.check_0.log"
file_1="multi_process.check_1.log"
str1="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
file_0="multi_process_fleetrun.check_0.log"
file_1="multi_process_fleetrun.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
......@@ -70,7 +69,7 @@ unset TRAINER_PORTS_NUM
echo ""
echo "paddle.distributed.launch async poll process test"
if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py abort; then
if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun abort; then
echo "train abort as planned"
fi
......
......@@ -14,6 +14,7 @@
"""Test cases for role makers."""
from __future__ import print_function
import paddle
import os
import unittest
......@@ -162,7 +163,8 @@ class TestCloudRoleMaker2(unittest.TestCase):
data = "1 1 1 1\n"
f.write(data)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
dataset.set_use_var([show, label])
dataset.load_into_memory()
......
......@@ -40,10 +40,9 @@ class TestCloudRoleMaker(unittest.TestCase):
from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
from paddle.fluid.incubate.fleet.base.role_maker import \
GeneralRoleMaker
from paddle.fluid.incubate.fleet.utils.http_server import KVHandler
from paddle.fluid.incubate.fleet.utils.http_server import KVServer
from paddle.fluid.incubate.fleet.utils.http_server import \
KVHTTPServer
from paddle.fleet.utils import KVHandler
from paddle.fleet.utils import KVServer
from paddle.fleet.utils import KVHTTPServer
except:
print("warning: no fleet, skip test_pslib_4")
return
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test cloud role maker."""
from __future__ import print_function
import os
import unittest
import paddle.fleet.base.role_maker as role_maker
class TestRoleMakerBase(unittest.TestCase):
"""
Test cases for RoleMakerBase
"""
def test_rolemaker_base(self):
role = role_maker.RoleMakerBase()
self.assertRaises(Exception, role.is_worker)
self.assertRaises(Exception, role.is_server)
self.assertRaises(Exception, role.is_first_worker)
self.assertRaises(Exception, role.worker_num)
self.assertRaises(Exception, role.server_num)
self.assertRaises(Exception, role.worker_index)
self.assertRaises(Exception, role.server_index)
self.assertRaises(Exception, role.role_id)
trainer_endpoints = role.get_trainer_endpoints()
self.assertTrue(len(trainer_endpoints) == 0)
pserver_endpoints = role.get_pserver_endpoints()
self.assertTrue(len(pserver_endpoints) == 0)
print(role.to_string())
self.assertTrue(role._all_gather(role._node_type_comm, 1) is None)
self.assertTrue(role._all_reduce(role._node_type_comm, 1) is None)
role._barrier(role._node_type_comm)
class TestCloudRoleMaker(unittest.TestCase):
"""
Test cases for PaddleCloudRoleMaker.
"""
def setUp(self):
"""Set up, set envs."""
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ[
"PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"
os.environ[
"PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.2:36001"
os.environ["POD_IP"] = "127.0.0.1"
def test_tr_rolemaker(self):
"""Test tr rolenamer."""
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ID"] = "0"
try:
import netifaces
except:
print("warning: no netifaces, skip test_tr_rolemaker")
return
ro = role_maker.PaddleCloudRoleMaker(
is_collective=False, init_gloo=False)
self.assertTrue(ro.is_worker())
self.assertFalse(ro.is_server())
self.assertEqual(ro.worker_num(), 2)
self.assertTrue(ro.is_first_worker())
worker_endpoints = ro.get_trainer_endpoints()
self.assertEqual(worker_endpoints[0], '127.0.0.1:36001')
self.assertEqual(ro.role_id(), 0)
def test_tr_rolemaker_collective(self):
ro = role_maker.PaddleCloudRoleMaker(is_collective=True)
self.assertEqual(ro.worker_num(), 2)
def test_ps_rolemaker(self):
"""Test ps rolemaker."""
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
try:
import netifaces
except:
print("warning: no netifaces, skip test_ps_rolemaker")
return
ro = role_maker.PaddleCloudRoleMaker(
is_collective=False, init_gloo=False)
self.assertEqual(ro.server_index(), 0)
self.assertFalse(ro.is_worker())
self.assertTrue(ro.is_server())
self.assertEqual(ro.server_num(), 2)
pserver_endpoints = ro.get_pserver_endpoints()
self.assertEqual(pserver_endpoints[0], '127.0.0.1:36001')
self.assertTrue(ro._all_gather(ro._all_comm, 1) is None)
self.assertTrue(ro._all_reduce(ro._all_comm, 1) is None)
def test_training_role(self):
"""Test training role."""
os.environ["TRAINING_ROLE"] = "TEST"
try:
import netifaces
except:
print("warning: no netifaces, skip test_training_role")
return
ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
self.assertRaises(ValueError, ro.generate_role)
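# Illustrative sketch (not part of this test file): a hypothetical reduction
# of how the environment variables set in setUp drive role detection --
# TRAINING_ROLE selects worker vs. server, PADDLE_TRAINERS_NUM gives
# worker_num, and POD_IP:PADDLE_PORT is looked up in
# PADDLE_PSERVERS_IP_PORT_LIST to obtain the server index. The real
# PaddleCloudRoleMaker is more involved; this only mirrors the assertions above.
import os
def sketch_detect_role():
    if os.environ["TRAINING_ROLE"] == "TRAINER":
        return "worker", int(os.environ["PADDLE_TRAINER_ID"])
    servers = os.environ["PADDLE_PSERVERS_IP_PORT_LIST"].split(",")
    me = "%s:%s" % (os.environ["POD_IP"], os.environ["PADDLE_PORT"])
    return "server", servers.index(me)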
class TestUserDefinedRoleMaker(unittest.TestCase):
"""
Test cases for UserDefinedRoleMaker.
"""
def setUp(self):
pass
def test_ps_rolemaker(self):
try:
import netifaces
except:
print("warning: no netifaces, skip test_ps_rolemaker")
return
ro = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
role=role_maker.Role.SERVER,
current_id=0,
worker_num=2)
self.assertEqual(ro.server_num(), 2)
ro.generate_role()
self.assertTrue(ro.is_server())
self.assertEqual(ro.role_id(), 0)
def test_tr_rolemaker(self):
try:
import netifaces
except:
print("warning: no netifaces, skip test_tr_rolemaker")
return
ro = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
role=role_maker.Role.WORKER,
current_id=0,
worker_num=2)
self.assertIn("127.0.0.1:36001", ro.get_pserver_endpoints())
self.assertTrue(ro.is_worker())
self.assertEqual(ro.role_id(), 0)
if __name__ == "__main__":
unittest.main()
......@@ -12,12 +12,27 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import unittest
import numpy as np
import tarfile
import tempfile
import os
import sys
from paddle.dataset.common import download, DATA_HOME
from paddle.fleet.base.util_factory import fleet_util
import paddle.fleet.base.role_maker as role_maker
class TestFleetUtil(unittest.TestCase):
proto_data_url = "https://fleet.bj.bcebos.com/fleet_util_data.tgz"
proto_data_md5 = "59b7f12fd9dc24b64ae8e4629523a92a"
module_name = "fleet_util_data"
pruned_dir = os.path.join("fleet_util_data", "pruned_model")
train_dir = os.path.join("fleet_util_data", "train_program")
def test_util_base(self):
import paddle.fleet as fleet
util = fleet.UtilBase()
......@@ -65,6 +80,262 @@ class TestFleetUtil(unittest.TestCase):
user_id = fleet.util.get_user_id()
self.assertEqual(user_id, 10)
def test_fs(self):
from paddle.fleet.utils import LocalFS
fs = LocalFS()
dirs, files = fs.ls_dir("test_tmp")
dirs, files = fs.ls_dir("./")
self.assertFalse(fs.need_upload_download())
fleet_util.set_file_system(fs)
def test_barrier(self):
try:
import netifaces
except:
print("warning: no netifaces, skip test_barrier")
return
gloo = fluid.core.Gloo()
gloo.set_rank(0)
gloo.set_size(1)
gloo.set_prefix("123")
gloo.set_iface("lo")
gloo.set_hdfs_store("./tmp_test_fleet_barrier", "", "")
gloo.init()
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.SERVER,
worker_endpoints=["127.0.0.1:6003"],
server_endpoints=["127.0.0.1:6001"])
role._node_type_comm = gloo
role._role_is_generated = True
fleet_util._set_role_maker(role)
fleet_util.barrier("worker")
def test_all_reduce(self):
try:
import netifaces
except:
print("warning: no netifaces, skip test_all_reduce")
return
gloo = fluid.core.Gloo()
gloo.set_rank(0)
gloo.set_size(1)
gloo.set_prefix("123")
gloo.set_iface("lo")
gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
gloo.init()
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.WORKER,
worker_endpoints=["127.0.0.1:6003"],
server_endpoints=["127.0.0.1:6001"])
role._node_type_comm = gloo
role._role_is_generated = True
fleet_util._set_role_maker(role)
output = fleet_util.all_reduce(1, "sum", comm_world="server")
print(output)
# self.assertEqual(output, 1)
def test_all_gather(self):
try:
import netifaces
except:
print("warning: no netifaces, skip test_all_gather")
return
gloo = fluid.core.Gloo()
gloo.set_rank(0)
gloo.set_size(1)
gloo.set_prefix("123")
gloo.set_iface("lo")
gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
gloo.init()
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.SERVER,
worker_endpoints=["127.0.0.1:6003"],
server_endpoints=["127.0.0.1:6001"])
role._node_type_comm = gloo
role._all_comm = gloo
role._role_is_generated = True
fleet_util._set_role_maker(role)
output = fleet_util.all_gather(1, comm_world="all")
print(output)
# self.assertTrue(len(output) == 1 and output[0] == 1)
self.assertRaises(Exception, fleet_util.all_gather, 1, "test")
def download_files(self):
path = download(self.proto_data_url, self.module_name,
self.proto_data_md5)
print('data is downloaded at ' + path)
tar = tarfile.open(path)
unzip_folder = tempfile.mkdtemp()
tar.extractall(unzip_folder)
return unzip_folder
def test_get_file_shard(self):
self.assertRaises(Exception, fleet_util.get_file_shard, "files")
try:
import netifaces
except:
print("warning: no netifaces, skip test_get_file_shard")
return
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.WORKER,
worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
fleet_util._set_role_maker(role)
files = fleet_util.get_file_shard(["1", "2", "3"])
self.assertTrue(len(files) == 2 and "1" in files and "2" in files)
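# Illustrative sketch (not part of this test file): a sharding rule
# consistent with the assertion above (worker 0 of 2 receives files "1" and
# "2" out of ["1", "2", "3"]). The real fleet_util.get_file_shard may differ
# in detail; this only shows the expected distribution.
def sketch_shard_files(files, worker_num, worker_index):
    base, remainder = divmod(len(files), worker_num)
    counts = [base + (1 if i < remainder else 0) for i in range(worker_num)]
    start = sum(counts[:worker_index])
    return files[start:start + counts[worker_index]]
# sketch_shard_files(["1", "2", "3"], 2, 0) -> ["1", "2"]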
def test_program_type_trans(self):
data_dir = self.download_files()
program_dir = os.path.join(data_dir, self.pruned_dir)
text_program = "pruned_main_program.pbtxt"
binary_program = "pruned_main_program.bin"
text_to_binary = fleet_util._program_type_trans(program_dir,
text_program, True)
binary_to_text = fleet_util._program_type_trans(program_dir,
binary_program, False)
self.assertTrue(
os.path.exists(os.path.join(program_dir, text_to_binary)))
self.assertTrue(
os.path.exists(os.path.join(program_dir, binary_to_text)))
def test_params_check(self):
data_dir = self.download_files()
class config:
pass
feed_config = config()
feed_config.feeded_vars_names = ['concat_1.tmp_0', 'concat_2.tmp_0']
feed_config.feeded_vars_dims = [682, 1199]
feed_config.feeded_vars_types = [np.float32, np.float32]
feed_config.feeded_vars_filelist = [
os.path.join(data_dir, os.path.join(self.pruned_dir, "concat_1")),
os.path.join(data_dir, os.path.join(self.pruned_dir, "concat_2"))
]
fetch_config = config()
fetch_config.fetch_vars_names = ['similarity_norm.tmp_0']
conf = config()
conf.batch_size = 1
conf.feed_config = feed_config
conf.fetch_config = fetch_config
conf.dump_model_dir = os.path.join(data_dir, self.pruned_dir)
conf.dump_program_filename = "pruned_main_program.pbtxt"
conf.is_text_dump_program = True
conf.save_params_filename = None
# test saved var's shape
conf.dump_program_filename = "pruned_main_program.save_var_shape_not_match"
self.assertRaises(Exception, fleet_util._params_check, conf)
# test program.proto without feed_op and fetch_op
conf.dump_program_filename = "pruned_main_program.no_feed_fetch"
results = fleet_util._params_check(conf)
self.assertTrue(len(results) == 1)
np.testing.assert_array_almost_equal(
results[0], np.array(
[[3.0590223e-07]], dtype=np.float32))
# test feed_var's shape
conf.dump_program_filename = "pruned_main_program.feed_var_shape_not_match"
self.assertRaises(Exception, fleet_util._params_check, conf)
# test correct case with feed_vars_filelist
conf.dump_program_filename = "pruned_main_program.pbtxt"
results = fleet_util._params_check(conf)
self.assertTrue(len(results) == 1)
np.testing.assert_array_almost_equal(
results[0], np.array(
[[3.0590223e-07]], dtype=np.float32))
# test correct case without feed_vars_filelist
conf.feed_config.feeded_vars_filelist = None
# test feed var with lod_level >= 2
conf.dump_program_filename = "pruned_main_program.feed_lod2"
self.assertRaises(Exception, fleet_util._params_check, conf)
conf.dump_program_filename = "pruned_main_program.pbtxt"
results = fleet_util._params_check(conf)
self.assertTrue(len(results) == 1)
def test_proto_check(self):
data_dir = self.download_files()
class config:
pass
conf = config()
conf.train_prog_path = os.path.join(
data_dir, os.path.join(self.train_dir, "join_main_program.pbtxt"))
conf.is_text_train_program = True
# test not match
conf.pruned_prog_path = os.path.join(
data_dir,
os.path.join(self.pruned_dir,
"pruned_main_program.save_var_shape_not_match"))
conf.is_text_pruned_program = True
conf.draw = False
res = fleet_util._proto_check(conf)
self.assertFalse(res)
# test match
conf.pruned_prog_path = os.path.join(
data_dir,
os.path.join(self.pruned_dir, "pruned_main_program.pbtxt"))
if sys.platform == 'win32':
conf.draw = False
else:
conf.draw = True
conf.draw_out_name = "pruned_check"
res = fleet_util._proto_check(conf)
self.assertTrue(res)
def test_visualize(self):
if sys.platform == 'win32':
pass
else:
data_dir = self.download_files()
program_path = os.path.join(
data_dir,
os.path.join(self.train_dir, "join_main_program.pbtxt"))
is_text = True
program = fleet_util._load_program(program_path, is_text)
output_dir = os.path.join(data_dir, self.train_dir)
output_filename = "draw_prog"
fleet_util._visualize_graphviz(program, output_dir, output_filename)
self.assertTrue(
os.path.exists(
os.path.join(output_dir, output_filename + ".dot")))
self.assertTrue(
os.path.exists(
os.path.join(output_dir, output_filename + ".pdf")))
if __name__ == "__main__":
unittest.main()
......@@ -20,9 +20,7 @@ import os
import sys
import inspect
from paddle.fluid.incubate.fleet.utils.fs import LocalFS, FS
from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
from paddle.fluid.incubate.fleet.utils.hdfs import FSTimeOut, FSFileExistsError, FSFileNotExistsError
from paddle.fleet.utils import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
class FSTest(unittest.TestCase):
......
......@@ -19,9 +19,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, T
import os
import sys
from paddle.fluid.incubate.fleet.utils.fs import LocalFS
from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
from paddle.fluid.incubate.fleet.utils.hdfs import FSTimeOut, FSFileExistsError, FSFileNotExistsError
from paddle.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
java_home = os.environ["JAVA_HOME"]
......
......@@ -14,13 +14,15 @@
from __future__ import print_function
import os
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph import declarative
from paddle.fluid.dygraph import declarative, ProgramTranslator
from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME
BATCH_SIZE = 32
BATCH_NUM = 20
......@@ -77,8 +79,8 @@ class LinearNetReturnLoss(fluid.dygraph.Layer):
def train(layer):
# create optimizer
adam = fluid.optimizer.AdamOptimizer(
learning_rate=0.1, parameter_list=layer.parameters())
adam = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=layer.parameters())
# create data loader
train_loader = fluid.io.DataLoader.from_generator(capacity=5)
train_loader.set_batch_generator(random_batch_reader())
......@@ -111,37 +113,43 @@ class TestJitSaveLoad(unittest.TestCase):
# config seed
fluid.default_main_program().random_seed = SEED
def train_and_save_model(self):
def train_and_save_model(self, model_path=None, configs=None):
layer = LinearNet(784, 1)
example_inputs, layer, _ = train(layer)
final_model_path = model_path if model_path else self.model_path
orig_input_types = [type(x) for x in example_inputs]
fluid.dygraph.jit.save(
layer=layer, model_path=self.model_path, input_spec=example_inputs)
layer=layer,
model_path=final_model_path,
input_spec=example_inputs,
configs=configs)
new_input_types = [type(x) for x in example_inputs]
self.assertEqual(orig_input_types, new_input_types)
return layer
def test_save(self):
# train and save model
self.train_and_save_model()
def test_load_infernece(self):
def test_save_load(self):
# train and save model
train_layer = self.train_and_save_model()
# load model
infer_layer = fluid.dygraph.jit.load(self.model_path)
program_translator = ProgramTranslator()
program_translator.enable(False)
loaded_layer = fluid.dygraph.jit.load(self.model_path)
self.load_and_inference(train_layer, loaded_layer)
self.load_dygraph_state_dict(train_layer)
self.load_and_finetune(train_layer, loaded_layer)
program_translator.enable(True)
def load_and_inference(self, train_layer, infer_layer):
train_layer.eval()
infer_layer.eval()
# inference & compare
x = fluid.dygraph.to_variable(
np.random.random((1, 784)).astype('float32'))
self.assertTrue(
np.array_equal(train_layer(x).numpy(), infer_layer(x).numpy()))
def test_load_finetune(self):
# train and save model
train_layer = self.train_and_save_model()
# load model
load_train_layer = fluid.dygraph.jit.load(self.model_path)
def load_and_finetune(self, train_layer, load_train_layer):
train_layer.train()
load_train_layer.train()
# train & compare
_, _, train_loss = train(train_layer)
......@@ -149,6 +157,19 @@ class TestJitSaveLoad(unittest.TestCase):
self.assertTrue(
np.array_equal(train_loss.numpy(), load_train_loss.numpy()))
def load_dygraph_state_dict(self, train_layer):
train_layer.eval()
# construct new model
new_layer = LinearNet(784, 1)
model_dict, _ = fluid.dygraph.load_dygraph(self.model_path)
new_layer.set_dict(model_dict)
new_layer.eval()
# inference & compare
x = fluid.dygraph.to_variable(
np.random.random((1, 784)).astype('float32'))
self.assertTrue(
np.array_equal(train_layer(x).numpy(), new_layer(x).numpy()))
def test_save_get_program_failed(self):
layer = LinearNetNotDeclarative(784, 1)
example_inputs, layer, _ = train(layer)
......@@ -158,6 +179,31 @@ class TestJitSaveLoad(unittest.TestCase):
model_path=self.model_path,
input_spec=example_inputs)
def test_load_dygraph_no_path(self):
model_path = "model.test_jit_save_load.no_path"
new_layer = LinearNet(784, 1)
with self.assertRaises(ValueError):
model_dict, _ = fluid.dygraph.load_dygraph(model_path)
def test_load_dygraph_no_var_info(self):
model_path = "model.test_jit_save_load.no_var_info"
self.train_and_save_model(model_path=model_path)
# remove `__variables.info__`
var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME)
os.remove(var_info_path)
new_layer = LinearNet(784, 1)
with self.assertRaises(RuntimeError):
model_dict, _ = fluid.dygraph.load_dygraph(model_path)
def test_load_dygraph_not_var_file(self):
model_path = "model.test_jit_save_load.no_var_file"
configs = fluid.dygraph.jit.SaveLoadConfig()
configs.params_filename = "__params__"
self.train_and_save_model(model_path=model_path, configs=configs)
new_layer = LinearNet(784, 1)
with self.assertRaises(RuntimeError):
model_dict, _ = fluid.dygraph.load_dygraph(model_path)
class TestJitSaveLoadConfig(unittest.TestCase):
def setUp(self):
......
......@@ -3,7 +3,7 @@ set -e
# use default values
# FIXME: random fails on Unknown command lines -c (or -m).
launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
python ${launch_py} multi_process.py
python ${launch_py} multi_process.py launch
# use paddlecloud
echo "begin test use paddlecloud"
......@@ -18,12 +18,12 @@ export PADDLE_PORT=35019
export TRAINER_PORTS_NUM=2
distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch
str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
file_0="multi_process.check_0.log"
file_1="multi_process.check_1.log"
file_0="multi_process_launch.check_0.log"
file_1="multi_process_launch.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
......@@ -54,7 +54,7 @@ unset TRAINER_PORTS_NUM
echo ""
echo "paddle.distributed.launch async poll process test"
if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py abort; then
if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch abort; then
echo "train abort as planned"
fi
......
......@@ -180,6 +180,51 @@ class TestLayer(LayerTest):
self.assertRaises(TypeError, test_type)
def test_Flatten(self):
inp = np.ones([3, 4, 4, 5], dtype='float32')
with self.static_graph():
t = layers.data(
name='data',
shape=[3, 4, 4, 5],
dtype='float32',
append_batch_size=False)
flatten = nn.Flatten()
ret = flatten(t)
static_ret = self.get_static_graph_result(
feed={'data': inp}, fetch_list=[ret])[0]
with self.dynamic_graph():
t = base.to_variable(inp)
flatten = nn.Flatten()
dy_ret = flatten(t)
dy_ret_value = dy_ret.numpy()
self.assertTrue(np.array_equal(static_ret, dy_ret_value))
with self.static_graph():
# the input of Linear must be Variable.
def test_Variable():
inp = np.ones([3, 32, 32], dtype='float32')
linear = nn.Linear(
32,
4,
bias_attr=fluid.initializer.ConstantInitializer(value=1))
linear_ret1 = linear(inp)
self.assertRaises(TypeError, test_Variable)
# the input dtype of Linear must be float16 or float32 or float64
# float16 only can be set on GPU place
def test_type():
inp = np.ones([3, 32, 32], dtype='int32')
linear = nn.Linear(
32,
4,
bias_attr=fluid.initializer.ConstantInitializer(value=1))
linear_ret2 = linear(inp)
self.assertRaises(TypeError, test_type)
def test_layer_norm(self):
inp = np.ones([3, 32, 32], dtype='float32')
with self.static_graph():
......
......@@ -16,6 +16,7 @@ TestCases for Monitor
"""
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
......@@ -51,7 +52,8 @@ class TestDatasetWithStat(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -13,6 +13,7 @@
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import numpy as np
......
......@@ -63,7 +63,7 @@ def case_generator(op_type, Xshape, diagonal, expected):
"diagonal: TypeError":
"diagonal in {} must be a python Int".format(op_type),
"input: ValueError":
"input shape in {} must be at least 2-D".format(op_type),
"x shape in {} must be at least 2-D".format(op_type),
}
class FailureCase(unittest.TestCase):
......@@ -71,7 +71,7 @@ def case_generator(op_type, Xshape, diagonal, expected):
data = fluid.data(shape=Xshape, dtype='float64', name=cls_name)
with self.assertRaisesRegexp(
eval(expected.split(':')[-1]), errmsg[expected]):
getattr(tensor, op_type)(input=data, diagonal=diagonal)
getattr(tensor, op_type)(x=data, diagonal=diagonal)
class SuccessCase(TrilTriuOpDefaultTest):
def initTestCase(self):
......
......@@ -17,6 +17,7 @@ no_check_set_white_list = [
'fake_quantize_range_abs_max',
'coalesce_tensor',
'flatten2',
'flatten_contiguous_range',
'lrn',
'squeeze2',
'reshape2',
......
......@@ -63,6 +63,7 @@ from .layer.common import Pool2D #DEFINE_ALIAS
from .layer.common import Pad2D #DEFINE_ALIAS
from .layer.common import Embedding #DEFINE_ALIAS
from .layer.common import Linear #DEFINE_ALIAS
from .layer.common import Flatten #DEFINE_ALIAS
from .layer.common import UpSample #DEFINE_ALIAS
from .layer.conv import Conv2D #DEFINE_ALIAS
from .layer.conv import Conv2DTranspose #DEFINE_ALIAS
......
......@@ -39,6 +39,7 @@ from .common import Pool2D #DEFINE_ALIAS
from .common import Pad2D #DEFINE_ALIAS
from .common import Embedding #DEFINE_ALIAS
from .common import Linear #DEFINE_ALIAS
from .common import Flatten #DEFINE_ALIAS
from .common import UpSample #DEFINE_ALIAS
from .conv import Conv2D #DEFINE_ALIAS
from .conv import Conv2DTranspose #DEFINE_ALIAS
......
......@@ -17,6 +17,7 @@ from ...fluid.dygraph import BilinearTensorProduct #DEFINE_ALIAS
from ...fluid.dygraph import Pool2D #DEFINE_ALIAS
from ...fluid.dygraph import Embedding #DEFINE_ALIAS
from ...fluid.dygraph import Linear #DEFINE_ALIAS
from ...fluid.dygraph import Flatten #DEFINE_ALIAS
from ...fluid.dygraph import layers
from .. import functional as F
......
......@@ -490,14 +490,13 @@ def _tril_triu_op(helper):
"""Base op of tril_op and triu_op
"""
op_type = helper.layer_type
x = helper.kwargs.get('input', None)
x = helper.kwargs.get('x', None)
assert x is not None, 'x cannot be None in {}'.format(op_type)
check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
op_type)
if len(x.shape) < 2:
raise ValueError("input shape in {} must be at least 2-D".format(
op_type))
raise ValueError("x shape in {} must be at least 2-D".format(op_type))
diagonal = helper.kwargs.get('diagonal', 0)
if not isinstance(diagonal, (int, )):
raise TypeError("diagonal in {} must be a python Int".format(op_type))
......@@ -521,18 +520,18 @@ def _tril_triu_op(helper):
return out
def tril(input, diagonal=0, name=None):
def tril(x, diagonal=0, name=None):
"""
:alias_main: paddle.tril
:alias: paddle.tril,paddle.tensor.tril,paddle.tensor.creation.tril
This op returns the lower triangular part of a matrix (2-D tensor) or batch
of matrices :attr:`input`, the other elements of the result tensor are set
of matrices :attr:`x`, the other elements of the result tensor are set
to 0. The lower triangular part of the matrix is defined as the elements
on and below the diagonal.
Args:
input (Variable): The input variable which is a Tensor.
x (Variable): The input variable x which is a Tensor.
Support data types: ``float64``, ``float32``, ``int32``, ``int64``.
diagonal (int, optional): The diagonal to consider, default value is 0.
If :attr:`diagonal` = 0, all elements on and below the main diagonal are
......@@ -545,47 +544,41 @@ def tril(input, diagonal=0, name=None):
user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Variable: Tensor, results of lower triangular operation by the specified diagonal of input tensor,
it's data type is the same as input's Tensor.
Variable: Tensor, results of lower triangular operation by the specified diagonal of input tensor x,
its data type is the same as that of x.
Raises:
TypeError: diagonal is not a int type.
ValueError: dimension of :attr:`input` is less than 2.
ValueError: dimension of :attr:`x` is less than 2.
Examples:
.. code-block:: python
import numpy as np
import paddle.tensor as tensor
import paddle.fluid as fluid
import paddle
data = np.arange(1, 13, dtype="int64").reshape(3,-1)
# array([[ 1, 2, 3, 4],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
x = fluid.data(shape=(-1, 4), dtype='int64', name='x')
exe = fluid.Executor(fluid.CPUPlace())
# example 1, default diagonal
tril = tensor.tril(x)
tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[tril], return_numpy=True)
paddle.enable_imperative()
x = paddle.imperative.to_variable(data)
tril1 = paddle.tensor.tril(x)
# array([[ 1, 0, 0, 0],
# [ 5, 6, 0, 0],
# [ 9, 10, 11, 0]])
# example 2, positive diagonal value
tril = tensor.tril(x, diagonal=2)
tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[tril], return_numpy=True)
tril2 = paddle.tensor.tril(x, diagonal=2)
# array([[ 1, 2, 3, 0],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
# example 3, negative diagonal value
tril = tensor.tril(x, diagonal=-1)
tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[tril], return_numpy=True)
tril3 = paddle.tensor.tril(x, diagonal=-1)
# array([[ 0, 0, 0, 0],
# [ 5, 0, 0, 0],
# [ 9, 10, 0, 0]])
......@@ -593,23 +586,23 @@ def tril(input, diagonal=0, name=None):
"""
if in_dygraph_mode():
op = getattr(core.ops, 'tril_triu')
return op(input, 'diagonal', diagonal, "lower", True)
return op(x, 'diagonal', diagonal, "lower", True)
return _tril_triu_op(LayerHelper('tril', **locals()))
def triu(input, diagonal=0, name=None):
def triu(x, diagonal=0, name=None):
"""
:alias_main: paddle.triu
:alias: paddle.triu,paddle.tensor.triu,paddle.tensor.creation.triu
This op returns the upper triangular part of a matrix (2-D tensor) or batch of matrices
:attr:`input`, the other elements of the result tensor are set to 0.
:attr:`x`, the other elements of the result tensor are set to 0.
The upper triangular part of the matrix is defined as the elements on and
above the diagonal.
Args:
input (Variable): The input variable which is a Tensor.
x (Variable): The input variable x which is a Tensor.
Support data types: ``float64``, ``float32``, ``int32``, ``int64``.
diagonal (int, optional): The diagonal to consider, default value is 0.
If :attr:`diagonal` = 0, all elements on and above the main diagonal are
......@@ -622,47 +615,41 @@ def triu(input, diagonal=0, name=None):
user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Variable: Tensor, results of upper triangular operation by the specified diagonal of input tensor,
it's data type is the same as input's Tensor.
Variable: Tensor, results of upper triangular operation by the specified diagonal of input tensor x,
its data type is the same as that of x.
Raises:
TypeError: diagonal is not a int type.
ValueError: dimension of :attr:`input` is less than 2.
ValueError: dimension of :attr:`x` is less than 2.
Examples:
.. code-block:: python
import numpy as np
import paddle

data = np.arange(1, 13, dtype="int64").reshape(3,-1)
# array([[ 1,  2,  3,  4],
#        [ 5,  6,  7,  8],
#        [ 9, 10, 11, 12]])

paddle.enable_imperative()
x = paddle.imperative.to_variable(data)

# example 1, default diagonal
triu1 = paddle.tensor.triu(x)
# array([[ 1,  2,  3,  4],
#        [ 0,  6,  7,  8],
#        [ 0,  0, 11, 12]])

# example 2, positive diagonal value
triu2 = paddle.tensor.triu(x, diagonal=2)
# array([[0, 0, 3, 4],
#        [0, 0, 0, 8],
#        [0, 0, 0, 0]])

# example 3, negative diagonal value
triu3 = paddle.tensor.triu(x, diagonal=-1)
# array([[ 1,  2,  3,  4],
#        [ 5,  6,  7,  8],
#        [ 0, 10, 11, 12]])
......@@ -670,7 +657,7 @@ def triu(input, diagonal=0, name=None):
"""
if in_dygraph_mode():
op = getattr(core.ops, 'tril_triu')
return op(x, 'diagonal', diagonal, "lower", False)
return _tril_triu_op(LayerHelper('triu', **locals()))
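# Both tril and triu dispatch to the single fused tril_triu kernel, selected
# by the "lower" attribute. A hypothetical pure-Python reference of that
# dispatch (_tril_triu_ref is an illustrative name, not the C++ kernel):
import numpy as np

def _tril_triu_ref(mat, diagonal=0, lower=True):
    # lower=True reproduces tril, lower=False reproduces triu.
    return np.tril(mat, k=diagonal) if lower else np.triu(mat, k=diagonal)

assert (_tril_triu_ref(np.eye(3), lower=False) == np.eye(3)).all()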
......
......@@ -729,26 +729,32 @@ def bmm(x, y, name=None):
Examples:
import numpy as np
import paddle
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[10, 3, 4], dtype='float32')
y = fluid.layers.data(name='y', shape=[10, 4, 5], dtype='float32')
out = paddle.bmm(x, y)

# In imperative mode:
# size input1: (2, 2, 3) and input2: (2, 3, 2)
input1 = np.array([[[1.0, 1.0, 1.0],[2.0, 2.0, 2.0]],[[3.0, 3.0, 3.0],[4.0, 4.0, 4.0]]])
input2 = np.array([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],[[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]])
paddle.enable_imperative()
x = paddle.imperative.to_variable(input1)
y = paddle.imperative.to_variable(input2)
out = paddle.bmm(x, y)
#output size: (2, 2, 2)
#output value:
#[[[6.0, 6.0],[12.0, 12.0]],[[45.0, 45.0],[60.0, 60.0]]]
out_np = out.numpy()
"""
x_shape = x.shape
y_shape = y.shape
if not len(x_shape) == len(y_shape) == 3:
    raise ValueError(
        "x and y should be 3-dimensional. But received x's dimension: {}, y's dimension: {}".
        format(x_shape, y_shape))
if x_shape[2] != y_shape[1]:
    raise ValueError(
        "x's width must be equal to y's height. But received x's shape: {}, y's shape: {}".
        format(x_shape, y_shape))
helper = LayerHelper('bmm', **locals())
if in_dygraph_mode():
    return core.ops.bmm(x, y)
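# A small numpy check (illustrative assumption) of the shape contract
# enforced above: x is [B, M, K], y is [B, K, N], output is [B, M, N].
import numpy as np

_bx = np.ones((10, 3, 4), dtype='float32')
_by = np.ones((10, 4, 5), dtype='float32')
assert np.matmul(_bx, _by).shape == (10, 3, 5)  # B=10, M=3, N=5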
......
......@@ -25,7 +25,6 @@ import numpy as np
from ..fluid.layers import cast #DEFINE_ALIAS
from ..fluid.layers import expand #DEFINE_ALIAS
from ..fluid.layers import expand_as #DEFINE_ALIAS
from ..fluid.layers import reshape #DEFINE_ALIAS
from ..fluid.layers import scatter #DEFINE_ALIAS
from ..fluid.layers import slice #DEFINE_ALIAS
......@@ -169,6 +168,114 @@ def flip(x, axis, name=None):
reverse = flip #DEFINE_ALIAS
def flatten(x, start_axis=0, stop_axis=-1, name=None):
"""
**Flatten op**
Flattens a contiguous range of axes in a tensor according to start_axis and stop_axis.
For Example:
.. code-block:: text
Case 1:
Given
X.shape = (3, 100, 100, 4)
and
start_axis = 1
stop_axis = 2
We get:
Out.shape = (3, 100 * 100, 4)
Case 2:
Given
X.shape = (3, 100, 100, 4)
and
start_axis = 0
stop_axis = -1
We get:
Out.shape = (3 * 100 * 100 * 4, )
Args:
x (Variable): A tensor whose number of dimensions is >= start_axis. Supported data types: float32,
float64, int8, int32, int64.
start_axis (int): the first axis to flatten.
stop_axis (int): the last axis to flatten.
name(str, Optional): For details, please refer to :ref:`api_guide_Name`.
Generally, no setting is required. Default: None.
Returns:
Variable: A tensor with the contents of the input tensor, with input \
axes flattened by indicated start axis and end axis. \
A Tensor with data type same as input x.
Raises:
ValueError: If x is not a Variable.
ValueError: If start_axis or stop_axis is illegal.
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.enable_imperative()
image_shape=(2, 3, 4, 4)
x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]).reshape(image_shape) / 100.
x = x.astype('float32')
img = paddle.imperative.to_variable(x)
out = paddle.flatten(img, start_axis=1, stop_axis=2)
# out shape is [2, 12, 4]
"""
if not (isinstance(x, Variable)):
    raise ValueError("The input x should be a Variable")

check_variable_and_dtype(
    x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64'], 'flatten')
helper = LayerHelper('flatten', **locals())
x_dim = len(x.shape)

if not (isinstance(start_axis, int)) or (
        start_axis > x_dim - 1) or start_axis < -x_dim:
    raise ValueError(
        "The start_axis should be an int, and in range [-rank(x), rank(x))")
if not (isinstance(stop_axis, int)) or (
        stop_axis > x_dim - 1) or stop_axis < -x_dim:
    raise ValueError(
        "The stop_axis should be an int, and in range [-rank(x), rank(x))")
if start_axis < 0:
    start_axis = start_axis + x_dim
if stop_axis < 0:
    stop_axis = stop_axis + x_dim
if start_axis > stop_axis:
    raise ValueError("The stop_axis should not be less than start_axis")

if in_dygraph_mode():
    dy_out, _ = core.ops.flatten_contiguous_range(
        x, 'start_axis', start_axis, 'stop_axis', stop_axis)
    return dy_out

out = helper.create_variable_for_type_inference(x.dtype)
x_shape = helper.create_variable_for_type_inference(x.dtype)
helper.append_op(
    type='flatten_contiguous_range',
    inputs={"X": x},
    outputs={'Out': out,
             'XShape': x_shape},
    attrs={"start_axis": start_axis,
           "stop_axis": stop_axis})
return out
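# A plain-Python sketch (illustrative only; _flattened_shape is a
# hypothetical helper) of the shape arithmetic performed after the axis
# normalization above.
from functools import reduce
import operator

def _flattened_shape(shape, start_axis, stop_axis):
    n = len(shape)
    start = start_axis + n if start_axis < 0 else start_axis
    stop = stop_axis + n if stop_axis < 0 else stop_axis
    merged = reduce(operator.mul, shape[start:stop + 1], 1)
    return shape[:start] + (merged,) + shape[stop + 1:]

assert _flattened_shape((2, 3, 4, 4), 1, 2) == (2, 12, 4)      # docstring example
assert _flattened_shape((3, 100, 100, 4), 0, -1) == (120000,)  # full flatten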
def roll(x, shifts, axis=None, name=None):
"""
:alias_main: paddle.roll
......@@ -252,13 +359,18 @@ def roll(x, shifts, axis=None, name=None):
return out
def stack(x, axis=0, name=None):
"""
:alias_main: paddle.stack
:alias: paddle.stack, paddle.tensor.stack, paddle.tensor.manipulation.stack
This OP stacks all the input tensors ``x`` along the ``axis`` dimension.
All tensors must be of the same shape and same dtype.
For example, given N tensors of shape [A, B], if ``axis == 0``, the shape of stacked
tensor is [N, A, B]; if ``axis == 1``, the shape of stacked
tensor is [A, N, B], etc.
.. code-block:: text
......@@ -284,7 +396,6 @@ def stack(x, axis=0, out=None, name=None):
Case 2:
Input:
x[0].shape = [1, 2]
x[0].data = [ [1.0 , 2.0 ] ]
......@@ -295,7 +406,7 @@ def stack(x, axis=0, out=None, name=None):
Attrs:
axis = 1 or axis = -2 # If axis = -2, axis = axis+ndim(x[0])+1 = -2+2+1 = 1.
Output:
Out.shape = [1, 3, 2]
......@@ -304,65 +415,40 @@ def stack(x, axis=0, out=None, name=None):
[5.0, 6.0] ] ]
Args:
x (Tensor|list[Tensor]): Input ``x`` can be a single tensor, or a ``list`` of tensors.
If ``x`` is a ``list``, the Tensors in ``x``
must be of the same shape and dtype. Supported data types: float32, float64, int32, int64.
axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``,
where ``R`` is the number of dimensions of the first input tensor ``x[0]``.
If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0.
name (str, optional): Please refer to :ref:`api_guide_Name`, Default None.
Returns:
Tensor: The stacked tensor with same data type as input.
Example:
.. code-block:: python
import numpy as np
import paddle
data1 = np.array([[1.0, 2.0]])
data2 = np.array([[3.0, 4.0]])
data3 = np.array([[5.0, 6.0]])
paddle.enable_imperative()
x1 = paddle.imperative.to_variable(data1)
x2 = paddle.imperative.to_variable(data2)
x3 = paddle.imperative.to_variable(data3)
out = paddle.stack([x1, x2, x3], axis=0)
print(out.shape) # [3, 1, 2]
print(out.numpy())
# [[[1., 2.]],
# [[3., 4.]],
# [[5., 6.]]]
"""
return layers.stack(x, axis, name)
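# np.stack follows the same axis rule described in the docstring and serves
# as a reference (illustrative aside): N tensors of shape [A, B] stack to
# [N, A, B] at axis=0 and [A, N, B] at axis=1.
import numpy as np

_parts = [np.array([[1., 2.]]), np.array([[3., 4.]]), np.array([[5., 6.]])]
assert np.stack(_parts, axis=0).shape == (3, 1, 2)
assert np.stack(_parts, axis=1).shape == (1, 3, 2)
assert np.stack(_parts, axis=-2).shape == (1, 3, 2)  # axis=-2 normalizes to 1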
def split(x, num_or_sections, axis=0, name=None):
......
......@@ -915,7 +915,7 @@ def mm(input, mat2, name=None):
return out
def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):
"""
:alias_main: paddle.addmm
:alias: paddle.addmm,paddle.tensor.addmm,paddle.tensor.math.addmm
......@@ -935,8 +935,8 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
input (Variable): The input Tensor/LoDTensor to be added to the final result.
x (Variable): The first input Tensor/LoDTensor for matrix multiplication.
y (Variable): The second input Tensor/LoDTensor for matrix multiplication.
beta (float): Coefficient of $input$.
alpha (float): Coefficient of $x*y$.
name (str, optional): Name of the output. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default is None.
Returns:
......@@ -947,25 +947,43 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
import numpy as np
import paddle

data_x = np.ones((2, 2)).astype(np.float32)
data_y = np.ones((2, 2)).astype(np.float32)
data_input = np.ones((2, 2)).astype(np.float32)

paddle.enable_imperative()

x = paddle.imperative.to_variable(data_x)
y = paddle.imperative.to_variable(data_y)
input = paddle.imperative.to_variable(data_input)

out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )

print( out.numpy() )
# [[10.5 10.5]
#  [10.5 10.5]]
"""
input_shape = input.shape
x_shape = x.shape
y_shape = y.shape
if not len(input_shape) == len(x_shape) == len(y_shape) == 2:
    raise ValueError("The dimension of input, x, y should be 2 but receive input's shape: {}, x's shape: {}, y's shape: {}".format(input_shape, x_shape, y_shape))
if input_shape[0] != x_shape[0]:
    if input_shape[0] != 1:
        raise ValueError("When x's dimension[0] is not equal to input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0]))
    if input_shape[1] != y_shape[1] and input_shape[1] != 1:
        raise ValueError("When y's dimension[1] is not equal to input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1]))
if input_shape[1] != y_shape[1]:
    if input_shape[1] != 1:
        raise ValueError("When y's dimension[1] is not equal to input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1]))
    if input_shape[0] != x_shape[0] and input_shape[0] != 1:
        raise ValueError("When x's dimension[0] is not equal to input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0]))
if x_shape[1] != y_shape[0]:
    raise ValueError("The input Variable x's width must be equal to Variable y's height. But received x's shape = {}, y's shape = {}.".format(x_shape, y_shape))

if in_dygraph_mode():
    out = core.ops.addmm(input, x, y, "Alpha", alpha, "Beta", beta)
    return out
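# A numpy reference (illustrative assumption) for the addmm semantics:
# out = alpha * (x @ y) + beta * input, with input broadcast where the
# checks above allow a dimension of 1.
import numpy as np

_ax = np.ones((2, 2), dtype='float32')
_ay = np.ones((2, 2), dtype='float32')
_ainp = np.ones((2, 2), dtype='float32')
assert (5.0 * (_ax @ _ay) + 0.5 * _ainp == 10.5).all()  # matches the docstring example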
......@@ -974,7 +992,7 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
attrs = {'Alpha': alpha, 'Beta': beta}
helper = LayerHelper("addmm", **locals())
check_variable_and_dtype(input, 'Input', ['float32', 'float64'], 'addmm')
check_variable_and_dtype(x, 'X', ['float32', 'float64'], 'addmm')
check_variable_and_dtype(y, 'Y', ['float32', 'float64'], 'addmm')
out = helper.create_variable_for_type_inference(dtype=x.dtype)
......
......@@ -21,3 +21,4 @@ prettytable
objgraph
astor
pathlib
netifaces
......@@ -152,6 +152,7 @@ packages=['paddle',
'paddle.fleet.dataset',
'paddle.fleet.metrics',
'paddle.fleet.proto',
'paddle.fleet.utils',
'paddle.framework',
'paddle.fluid',
'paddle.fluid.dygraph',
......