Commit 7be555ee authored by Z zlsh80826

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into trt_stack_op, test=develop

......@@ -45,14 +45,35 @@ inline void InitVarsInScope(const std::vector<VarInfo> &var_infos, Scope *scope,
// get CommContext and remote send and recv op
void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
#ifdef PADDLE_WITH_DISTRIBUTE
// init communicator here
auto *instance = operators::distributed::Communicator::GetInstance();
auto initialized = instance ? true : false;
PADDLE_ENFORCE_EQ(initialized, true,
platform::errors::InvalidArgument(
"Communicator is not Initialized, you may use "
"FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
"develop/markdown_doc/transpiler)"));
bool need_communicator = false;
for (auto &node : graphs[0]->Nodes()) {
VLOG(3) << "node name " << node->Name();
if (node && node->IsOp()) {
if (node->Name() == "send") {
auto send_varnames =
BOOST_GET_CONST(std::vector<std::string>,
node->Op()->GetNullableAttr("send_varnames"));
if (send_varnames.size() > 0) {
need_communicator = true;
break;
}
}
}
}
if (need_communicator) {
// init communicator here
auto *instance = operators::distributed::Communicator::GetInstance();
auto initialized = instance ? true : false;
PADDLE_ENFORCE_EQ(initialized, true,
platform::errors::InvalidArgument(
"Communicator is not Initialized, you may use "
"FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
"develop/markdown_doc/transpiler)"));
}
#endif
}
......
......@@ -19,6 +19,6 @@ else()
cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
endif(WITH_GLOO)
cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto)
cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context)
cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
......@@ -54,10 +54,9 @@ void HdfsStore::set(const std::string& key, const std::vector<char>& data) {
paddle::framework::fs_remove(tmp);
if (i == retry_times_) {
VLOG(0) << "fs_open_write failed, retry times reaches limit";
// PADDLE_THROW(platform::errors::PreconditionNotMet(
// "fs_open_write failed, retry times reaches"
// " limit ",
// retry_times_));
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"fs_open_write failed, retry times reaches %d limit.",
retry_times_));
}
} else {
break;
......@@ -143,9 +142,9 @@ void HdfsStore::wait(const std::vector<std::string>& keys,
break;
}
}
// PADDLE_THROW(platform::errors::ExecutionTimeout(
VLOG(0) << "TIMEOUT self_rank = " << self_rank_
<< " pair_rank = " << last_check_rank;
PADDLE_THROW(paddle::platform::errors::ExecutionTimeout(
"TIMEOUT self_rank = %d pair_rank = %d", self_rank_,
last_check_rank));
}
std::this_thread::sleep_for(std::chrono::milliseconds(wait_sleep_ms_));
}
......
......@@ -28,38 +28,6 @@ DEFINE_bool(enable_unused_var_check, false,
"Checking whether operator contains unused inputs, "
"especially for grad operator. It should be in unittest.");
// NOTE(zhiqiu): Currently, there are some operators which involve unused
// inputs and cannot be removed from the allow_list below.
// They can be mainly divided into three categories:
// 0: the inputs of which are only used in if branch, or used in cuda kernel but
// not in cpu kernel;
// 1: the inputs of which are used to indicate dtype of outputs;
// 2: the inputs of which are used in fused operators.
// The category number is presented in the comments after each operator.
const std::unordered_set<std::string> op_with_unsed_vars_allow_list = {
"batch_norm", // 0
"batch_norm_grad", // 0
"sync_batch_norm", // 0
"sync_batch_norm_grad", // 0
"inplace_abn", // 0
"inplace_abn_grad", // 0
"dgc_momentum", // 0
"fake_quantize_range_abs_max", // 0
"rmsprop", // 0
"sequence_conv_grad", // 0
"roi_perspective_transform_grad", // 0
"fill_zeros_like", // 1
"fill_any_like", // 1
"nce_grad", // 1
"precision_recall", // 1
"fusion_seqpool_cvm_concat", // 2
"fused_batch_norm_act", // 2
"fused_batch_norm_act_grad", // 2
"data_norm", // 0
"data_norm_grad", // 0
};
namespace paddle {
namespace framework {
......@@ -75,9 +43,44 @@ void LogVarUsageIfUnusedVarCheckEnabled(const std::string &name) {
}
}
static const std::unordered_set<std::string> &GetOpWithUnusedVarAllowSet() {
// NOTE(zhiqiu): Currently, there are some operators which involve unused
// inputs and cannot be removed from the allow_list below.
// They can be mainly divided into three categories:
// 0: the inputs of which are only used in the if branch, or used in the
//    cuda kernel but not in the cpu kernel;
// 1: the inputs of which are used to indicate the dtype of outputs;
// 2: the inputs of which are used in fused operators.
// The category number is presented in the comments after each operator.
// Use pointer here for safe static deinitialization
static auto *allow_set = new std::unordered_set<std::string>({
// called once
"batch_norm", // 0
"batch_norm_grad", // 0
"sync_batch_norm", // 0
"sync_batch_norm_grad", // 0
"inplace_abn", // 0
"inplace_abn_grad", // 0
"dgc_momentum", // 0
"fake_quantize_range_abs_max", // 0
"rmsprop", // 0
"sequence_conv_grad", // 0
"roi_perspective_transform_grad", // 0
"fill_zeros_like", // 1
"fill_any_like", // 1
"nce_grad", // 1
"precision_recall", // 1
"fusion_seqpool_cvm_concat", // 2
"fused_batch_norm_act", // 2
"fused_batch_norm_act_grad", // 2
"data_norm", // 0
"data_norm_grad", // 0);
});
return *allow_set;
}
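A minimal sketch of the "never destroyed" pattern used above (GetExampleAllowSet and example_op are hypothetical names): the set is heap-allocated through a function-local static pointer, initialized once on first use and intentionally never deleted, so no static destructor can run and invalidate it while other static objects are still shutting down.

#include <string>
#include <unordered_set>

static const std::unordered_set<std::string>& GetExampleAllowSet() {
  // Initialized exactly once (thread-safe since C++11); never deleted,
  // so the set stays valid for the whole process lifetime.
  static auto* allow_set =
      new std::unordered_set<std::string>({"example_op"});
  return *allow_set;
}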
void CheckUnusedVar(const OperatorBase &op, const Scope &scope) {
// skip op in allow list.
if (op_with_unsed_vars_allow_list.count(op.Type()) != 0) {
if (GetOpWithUnusedVarAllowSet().count(op.Type()) != 0) {
return;
}
auto *used_set = GetThreadLocalUsedVarNameSet();
......
......@@ -56,9 +56,11 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
return static_cast<nvinfer1::IRuntime*>(
dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
}
static nvinfer1::IPluginRegistry* getPluginRegistry() {
#if IS_TRT_VERSION_GE(6000)
static nvinfer1::IPluginRegistry* GetPluginRegistry() {
return static_cast<nvinfer1::IPluginRegistry*>(dy::getPluginRegistry());
}
#endif
// A logger for creating the TensorRT infer builder.
class NaiveLogger : public nvinfer1::ILogger {
......
......@@ -178,12 +178,16 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
std::string name_space_;
std::string plugin_base_;
};
#endif
template <typename T>
class TrtPluginRegistrarV2 {
public:
TrtPluginRegistrarV2() { getPluginRegistry()->registerCreator(creator, ""); }
TrtPluginRegistrarV2() {
static auto func_ptr = GetPluginRegistry();
if (func_ptr != nullptr) {
func_ptr->registerCreator(creator, "");
}
}
private:
T creator;
......@@ -193,6 +197,8 @@ class TrtPluginRegistrarV2 {
static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2<name> \
plugin_registrar_##name {}
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
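The registrar above works through static object construction: a file-scope TrtPluginRegistrarV2 instance runs its constructor before main() and, with this change, registers the creator only if the dynamically loaded plugin registry is actually available. A minimal sketch of the same static-registration idea, with hypothetical ExampleCreator/ExampleRegistrar names:

#include <iostream>

struct ExampleCreator {
  const char* Name() const { return "example_plugin"; }
};

template <typename T>
struct ExampleRegistrar {
  ExampleRegistrar() {
    // Runs during static initialization, before main() is entered.
    std::cout << "registering " << T().Name() << std::endl;
  }
};

// A file-scope instance triggers registration as a side effect.
static ExampleRegistrar<ExampleCreator> example_registrar;

int main() { return 0; }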
......
......@@ -304,6 +304,7 @@ REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
REGISTER_OP_CPU_KERNEL(
squeeze, ops::SqueezeKernel<paddle::platform::CPUDeviceContext, float>,
ops::SqueezeKernel<paddle::platform::CPUDeviceContext, double>,
ops::SqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int>,
ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -311,12 +312,14 @@ REGISTER_OP_CPU_KERNEL(
squeeze_grad,
ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
squeeze2, ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, float>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, double>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, bool>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -324,6 +327,7 @@ REGISTER_OP_CPU_KERNEL(
squeeze2_grad,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -21,6 +21,7 @@ REGISTER_OP_CUDA_KERNEL(
squeeze, ops::SqueezeKernel<paddle::platform::CUDADeviceContext, float>,
ops::SqueezeKernel<paddle::platform::CUDADeviceContext, double>,
ops::SqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::SqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int>,
ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -29,6 +30,7 @@ REGISTER_OP_CUDA_KERNEL(
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -36,6 +38,7 @@ REGISTER_OP_CUDA_KERNEL(
squeeze2, ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, float>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, double>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, bool>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -44,6 +47,7 @@ REGISTER_OP_CUDA_KERNEL(
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -36,26 +36,29 @@ extern void* tensorrt_dso_handle;
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using tensorrt_func = decltype(&::__name); \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \
PADDLE_ENFORCE_NOT_NULL(tensorrt_dso_handle, \
platform::errors::Unavailable( \
"Load tensorrt %s failed", #__name)); \
}); \
static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \
PADDLE_ENFORCE_NOT_NULL( \
p_##__name, \
platform::errors::Unavailable("Load tensorrt %s failed", #__name)); \
if (p_##__name == nullptr) { \
return nullptr; \
} \
using tensorrt_func = decltype(&::__name); \
return reinterpret_cast<tensorrt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#if (NV_TENSORRT_MAJOR >= 6)
#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
__macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL); \
__macro(getPluginRegistry);
#else
#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
__macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL);
#endif
TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
......
......@@ -33,6 +33,7 @@ limitations under the License. */
#include <curand.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#include "paddle/fluid/platform/cuda_error.pb.h"
#endif // PADDLE_WITH_CUDA
......@@ -69,6 +70,8 @@ limitations under the License. */
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/imperative/type_defs.h"
DECLARE_int32(call_stack_level);
namespace paddle {
namespace platform {
......@@ -226,9 +229,7 @@ inline std::string SimplifyDemangleStr(std::string str) {
return str;
}
template <typename StrType>
inline std::string GetTraceBackString(StrType&& what, const char* file,
int line) {
inline std::string GetCurrentTraceBackString() {
static constexpr int TRACE_STACK_LIMIT = 100;
std::ostringstream sout;
......@@ -256,6 +257,13 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
#else
sout << "Windows not support stack backtrace yet.\n";
#endif
return sout.str();
}
template <typename StrType>
inline std::string GetErrorSumaryString(StrType&& what, const char* file,
int line) {
std::ostringstream sout;
sout << "\n----------------------\nError Message "
"Summary:\n----------------------\n";
sout << string::Sprintf("%s at (%s:%d)", std::forward<StrType>(what), file,
......@@ -264,6 +272,17 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
return sout.str();
}
template <typename StrType>
inline std::string GetTraceBackString(StrType&& what, const char* file,
int line) {
if (FLAGS_call_stack_level > 1) {
// FLAGS_call_stack_level>1 means showing c++ call stack
return GetCurrentTraceBackString() + GetErrorSumaryString(what, file, line);
} else {
return GetErrorSumaryString(what, file, line);
}
}
inline bool is_error(bool stat) { return !stat; }
inline void throw_on_error(bool stat, const std::string& msg) {
......@@ -427,7 +446,7 @@ struct EnforceNotMet : public std::exception {
*
* Examples:
* GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
*/
*/
#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \
(([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \
auto* __ptr = (__PTR); \
......@@ -463,7 +482,7 @@ struct EnforceNotMet : public std::exception {
*
* Examples:
* OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul");
*/
*/
#define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \
do { \
PADDLE_ENFORCE_EQ(__EXPR, true, paddle::platform::errors::NotFound( \
......@@ -491,7 +510,7 @@ struct EnforceNotMet : public std::exception {
* Note: GCC 4.8 cannot select the right overloaded function here, so we
* need to define different functions and macros; after we upgrade the
* CI gcc version, we can define just one BOOST_GET macro.
*/
*/
namespace details {
#define DEFINE_SAFE_BOOST_GET(__InputType, __OutputType, __OutputTypePtr, \
......
......@@ -483,3 +483,28 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes
* Note:
*/
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
/**
* Debug related FLAG
* Name: FLAGS_call_stack_level
* Since Version: 2.0.0
* Value Range: int, default=2
* Example:
* Note: Used for debugging. Determines the call stack to print when an
* error or exception happens.
* If FLAGS_call_stack_level == 0, only the error message summary will be shown.
* If FLAGS_call_stack_level == 1, the python stack and error message summary
* will be shown.
* If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
* message summary will be shown.
*/
DEFINE_int32(
call_stack_level, 2,
"Determine the call stack to print when error or exeception happens."
// TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
// "If FLAGS_call_stack_level == 0, only the error message summary will be "
// "shown. "
"If FLAGS_call_stack_level == 1, the python stack and error message "
"summary will be shown."
"If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
"error message summary will be shown.");
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/pybind/global_value_getter_setter.h"
#include <cctype>
#include <functional>
#include <string>
......@@ -20,6 +21,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/python_headers.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -35,6 +37,7 @@ DECLARE_bool(cpu_deterministic);
DECLARE_bool(enable_rpc_profiler);
DECLARE_int32(multiple_of_cupti_buffer_size);
DECLARE_bool(reader_queue_speed_test_mode);
DECLARE_int32(call_stack_level);
// device management
DECLARE_int32(paddle_num_threads);
// executor
......@@ -337,14 +340,15 @@ static void RegisterGlobalVarGetterSetter() {
REGISTER_PUBLIC_GLOBAL_VAR(
FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
FLAGS_cpu_deterministic, FLAGS_enable_rpc_profiler,
FLAGS_multiple_of_cupti_buffer_size, FLAGS_reader_queue_speed_test_mode,
FLAGS_pe_profile_fname, FLAGS_print_sub_graph_dir,
FLAGS_fraction_of_cpu_memory_to_use, FLAGS_fuse_parameter_groups_size,
FLAGS_fuse_parameter_memory_size, FLAGS_init_allocated_mem,
FLAGS_initial_cpu_memory_in_mb, FLAGS_memory_fraction_of_eager_deletion,
FLAGS_use_pinned_memory, FLAGS_benchmark, FLAGS_inner_op_parallelism,
FLAGS_tracer_profile_fname, FLAGS_paddle_num_threads);
FLAGS_call_stack_level, FLAGS_cpu_deterministic,
FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
FLAGS_fuse_parameter_groups_size, FLAGS_fuse_parameter_memory_size,
FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
FLAGS_paddle_num_threads);
#ifdef PADDLE_WITH_CUDA
REGISTER_PUBLIC_GLOBAL_VAR(
......
......@@ -12,5 +12,523 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Defination of Role Makers."""
import os
import time
import numpy as np
from multiprocessing import Process, Manager
import paddle.fluid as fluid
# __all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
__all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
class Role:
WORKER = 1
SERVER = 2
class RoleMakerBase(object):
"""
RoleMakerBase is a base class for assigning a role to current process
in distributed training.
A paddle developer can implement RoleMakerBase to design a role maker
for worker or pserver assignment.
"""
def __init__(self):
self._worker_endpoints = []
self._server_endpoints = []
self._role_is_generated = False
self._role = None
self._current_id = -1
self._node_type = None
self._node_type_comm = None
self._all_comm = None
def is_worker(self):
"""
return whether the current process is a worker
"""
raise NotImplementedError("Please implement this method in child class")
def is_server(self):
"""
return whether the current process is a server
"""
raise NotImplementedError("Please implement this method in child class")
def is_first_worker(self):
"""
Check whether the node is the first instance of worker.
Returns:
bool: True if this is the first node of worker,
False if not.
"""
raise NotImplementedError("Please implement this method in child class")
def worker_num(self):
"""
Get current total worker number.
Returns:
int: worker number
"""
raise NotImplementedError("Please implement this method in child class")
def server_num(self):
"""
Get current total server number.
Returns:
int: server number
"""
raise NotImplementedError("Please implement this method in child class")
def worker_index(self):
"""
Get current worker id.
Returns:
int: node id
"""
raise NotImplementedError("Please implement this method in child class")
def server_index(self):
"""
Get current server id.
Returns:
int: node id
"""
raise NotImplementedError("Please implement this method in child class")
def role_id(self):
"""
Get current id.
Returns:
int: node id
"""
raise NotImplementedError("Please implement this method in child class")
def get_trainer_endpoints(self):
"""
return trainer endpoints
"""
return self._worker_endpoints
def get_pserver_endpoints(self):
"""
return pserver endpoints
"""
return self._server_endpoints
def to_string(self):
return "role: {}, current_id: {}, worker_endpoints: {}, server_endpoints: {}".format(
self._role, self._current_id, self._worker_endpoints,
self._server_endpoints)
def _all_gather(self, comm_world, input):
"""
Args:
input(int|float): input value
Returns:
return a list of values
"""
print("warning: RoleMakerBase does not have all gather.")
return None
def _all_reduce(self, comm_world, input, mode="sum"):
"""
Args:
input(list|numpy.array): one-dimensional array
mode(str): "sum", "min" or "max"
Returns:
the reduced result (None in the base class)
"""
print("warning: RoleMakerBase does not have all reduce.")
return None
def _barrier(self, comm_world):
"""
barrier between trainers if current role is TRAINER
"""
print("warning: RoleMakerBase does not have barrier worker.")
class PaddleCloudRoleMaker(RoleMakerBase):
def __init__(self, is_collective=False, init_gloo=True, **kwargs):
super(PaddleCloudRoleMaker, self).__init__()
self._is_collective = is_collective
self._init_gloo = init_gloo
self._kwargs = kwargs
self._role_is_generated = False
self._server_endpoints = None
self._worker_endpoints = None
self._node_type_comm = None
self._all_comm = None
if not self._is_collective:
self._hdfs_name = kwargs.get("hdfs_name", "")
self._hdfs_ugi = kwargs.get("hdfs_ugi", "")
self._hdfs_path = kwargs.get("path", "").rstrip("/")
self._init_timeout_seconds = kwargs.get("init_timeout_seconds",
3600)
self._run_timeout_seconds = kwargs.get("run_timeout_seconds",
9999999)
ip_port = kwargs.get("http_ip_port", "")
self._http_ip_port = []
self._http_server = None
# if ip_port is not empty, it will use http instead of hdfs
if ip_port != "":
self._http_ip_port = ip_port.split(":")
# it's for communication between processes
self._manager = Manager()
# global dict to store status
self._http_server_d = self._manager.dict()
# set running status of http server
self._http_server_d["running"] = False
self._iface = self.__get_default_iface()
# this environment variable can be empty
self._prefix = os.getenv("SYS_JOB_ID", "")
def _barrier(self, comm_world):
if comm_world:
comm_world.barrier()
def _all_gather(self, comm_world, input):
if comm_world:
self._barrier(comm_world)
output = comm_world.all_gather(input)
return output
else:
return None
def _all_reduce(self, comm_world, input, mode="sum"):
if not comm_world:
return None
input = np.array(input)
input_shape = input.shape
input_list = input.reshape(-1).tolist()
self._barrier(comm_world)
ans = comm_world.all_reduce(input_list, mode)
output = np.array(ans).reshape(input_shape)
return output
def is_worker(self):
"""
whether current process is worker
"""
if not self._role_is_generated:
self.generate_role()
return self._role == Role.WORKER
def is_server(self):
"""
whether current process is server
"""
if not self._role_is_generated:
self.generate_role()
return self._role == Role.SERVER
def is_first_worker(self):
"""
whether current process is worker of rank 0
"""
if not self._role_is_generated:
self.generate_role()
return self._role == Role.WORKER and self._current_id == 0
def worker_index(self):
"""
get index of current worker
"""
if not self._role_is_generated:
self.generate_role()
return self._current_id
def server_index(self):
"""
get index of current server
"""
if not self._role_is_generated:
self.generate_role()
return self._current_id
def role_id(self):
"""
get index of current node
"""
if self.is_server():
return self.server_index()
elif self.is_worker():
return self.worker_index()
def worker_num(self):
"""
return the current number of workers
"""
if not self._role_is_generated:
self.generate_role()
return self._trainers_num
def server_num(self):
    """
    return the current number of servers
    """
    if not self._role_is_generated:
        self.generate_role()
    return len(self._server_endpoints)
def get_trainer_endpoints(self):
"""
get endpoint of all trainers
"""
if not self._role_is_generated:
self.generate_role()
return self._worker_endpoints
def get_pserver_endpoints(self):
"""
get endpoint of all pservers
"""
if not self._role_is_generated:
self.generate_role()
return self._server_endpoints
def _get_rank(self):
"""
get current rank in all workers and pservers
"""
if not self._role_is_generated:
self.generate_role()
return self._rank
def _get_size(self):
"""
get total num of all workers and pservers
"""
if not self._role_is_generated:
self.generate_role()
return self._size
def _ps_env(self):
try:
# Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
# format: string(ip:port), eg. 127.0.0.1:6001
self._server_endpoints = os.environ[
"PADDLE_PSERVERS_IP_PORT_LIST"].split(",")
self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
"").split(",")
trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
training_role = os.environ["TRAINING_ROLE"]
if training_role not in ["TRAINER", "PSERVER"]:
raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
if training_role == "TRAINER":
role = Role.WORKER
current_id = int(os.environ["PADDLE_TRAINER_ID"])
if len(self._worker_endpoints) > 0:
self._cur_endpoint = self._worker_endpoints[current_id]
elif training_role == "PSERVER":
role = Role.SERVER
port = os.environ["PADDLE_PORT"]
ip = os.environ["POD_IP"]
self._cur_endpoint = ip + ":" + port
current_id = self._server_endpoints.index(self._cur_endpoint)
else:
raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
except ValueError as e:
    raise ValueError(
        "something wrong with PaddleCloud environment variables, "
        "please check: {}".format(e))
self._trainers_num = trainers_num
self._role = role
self._current_id = current_id
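A hedged sketch of the environment a parameter-server job is expected to provide before generate_role() runs; all addresses below are placeholders:

import os

# Two pservers and two trainers; this process acts as trainer 0.
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:6001,127.0.0.1:6002"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6101,127.0.0.1:6102"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["TRAINING_ROLE"] = "TRAINER"  # or "PSERVER", with PADDLE_PORT/POD_IP
os.environ["PADDLE_TRAINER_ID"] = "0"

role_maker = PaddleCloudRoleMaker(is_collective=False, init_gloo=False)
role_maker.generate_role()
assert role_maker.is_worker() and role_maker.worker_index() == 0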
def _collective_env(self):
self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
self._training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
assert (self._training_role == "TRAINER")
self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
assert self._worker_endpoints is not None, "can't find PADDLE_TRAINER_ENDPOINTS"
self._worker_endpoints = self._worker_endpoints.split(",")
self._trainers_num = len(self._worker_endpoints)
def _init_gloo_env(self):
def init_gloo_instance(role="trainer"):
role = role.lower()
assert role in ["trainer", "pserver", "all"]
if role == "trainer":
all_list = self._worker_endpoints
rank = self._current_id
elif role == "pserver":
all_list = self._server_endpoints
rank = self._current_id
else:
all_list = self._worker_endpoints + self._server_endpoints
rank = all_list.index(self._cur_endpoint)
gloo = fluid.core.Gloo()
gloo.set_rank(rank)
gloo.set_size(len(all_list))
gloo.set_prefix(self._prefix)
gloo.set_iface(self._iface)
gloo.set_timeout_seconds(self._init_timeout_seconds,
self._run_timeout_seconds)
if len(self._http_ip_port) != 0:
gloo.set_http_store(self._http_ip_port[0],
int(self._http_ip_port[1]), role)
else:
gloo.set_hdfs_store(self._hdfs_path + "/" + role,
self._hdfs_name, self._hdfs_ugi)
gloo.init()
return gloo
# paddlecloud supports gloo
if self._role == Role.WORKER:
if self._current_id == 0 and len(self._http_ip_port) != 0:
size_d = {
"trainer": len(self._worker_endpoints),
"pserver": len(self._server_endpoints),
"all":
len(self._worker_endpoints) + len(self._server_endpoints)
}
# child process for http server
self._http_server = Process(
target=self.__start_kv_server,
args=(self._http_server_d, size_d))
self._http_server.daemon = True
# set running status to True
self._http_server_d["running"] = True
# start child process
self._http_server.start()
self._node_type = 1
gloo = init_gloo_instance("trainer")
self._node_type_comm = gloo
else:
assert self._role == Role.SERVER
self._node_type = 0
gloo = init_gloo_instance("pserver")
self._node_type_comm = gloo
all_list = self._worker_endpoints + self._server_endpoints
self._rank = all_list.index(self._cur_endpoint)
self._size = len(all_list)
gloo = init_gloo_instance("all")
self._all_comm = gloo
if self._http_server is not None:
# set running status to False
self._http_server_d["running"] = False
# wait until child process exits
self._http_server.join()
def generate_role(self):
"""
generate role for role maker
"""
if not self._role_is_generated:
if not self._is_collective:
self._ps_env()
if self._init_gloo:
self._init_gloo_env()
else:
self._collective_env()
self._role_is_generated = True
def __get_default_iface(self):
"""
get default physical interface
"""
default1 = self.__get_default_iface_from_gateway()
default2 = self.__get_default_iface_from_interfaces()
return default2 if default1 == "lo" else default1
def __get_default_iface_from_gateway(self):
"""
get default physical interface
"""
import netifaces
gateways = netifaces.gateways()
if gateways.get(netifaces.AF_INET) is not None:
gateway = gateways[netifaces.AF_INET]
if len(gateway) > 0 and len(gateway[0]) > 1:
return gateway[0][1]
return "lo"
def __get_default_iface_from_interfaces(self):
"""
get default physical interface
"""
import netifaces
for intf_name in netifaces.interfaces():
addresses = netifaces.ifaddresses(intf_name)
if netifaces.AF_INET in addresses:
ipv4_addresses = addresses[netifaces.AF_INET]
for ipv4_address in ipv4_addresses:
if 'broadcast' in ipv4_address:
return intf_name
return "lo"
def __start_kv_server(self, http_server_d, size_d):
from paddle.fleet.utils import KVServer
http_server = KVServer(int(self._http_ip_port[1]), size_d)
http_server.start()
wait_seconds = 5
while http_server_d.get("running",
                        False) and not http_server.should_stop():
time.sleep(wait_seconds)
http_server.stop()
class UserDefinedRoleMaker(PaddleCloudRoleMaker):
def __init__(self, is_collective=False, init_gloo=False, **kwargs):
super(UserDefinedRoleMaker, self).__init__(
is_collective=is_collective, init_gloo=init_gloo, **kwargs)
def _user_defined_ps_env(self):
self._server_endpoints = self._kwargs.get("server_endpoints")
self._worker_endpoints = self._kwargs.get("worker_endpoints", [])
self._trainers_num = self._kwargs.get("worker_num", 0)
if self._trainers_num == 0:
assert (len(self._worker_endpoints) > 0)
self._trainers_num = len(self._worker_endpoints)
self._role = self._kwargs.get("role")
self._current_id = self._kwargs.get("current_id")
if self._role == Role.WORKER and len(
self._worker_endpoints) > self._current_id:
self._cur_endpoint = self._worker_endpoints[self._current_id]
elif self._role == Role.SERVER:
self._cur_endpoint = self._server_endpoints[self._current_id]
def _user_defined_collective_env(self):
self._worker_endpoints = self._kwargs.get("worker_endpoints")
self._current_id = self._kwargs.get("current_id")
self._trainers_num = len(self._worker_endpoints)
self._training_role = Role.WORKER
def generate_role(self):
"""
generate role for role maker
"""
if not self._role_is_generated:
if not self._is_collective:
self._user_defined_ps_env()
if self._init_gloo:
self._init_gloo_env()
else:
self._user_defined_collective_env()
self._role_is_generated = True
......@@ -18,12 +18,27 @@
__all__ = ['UtilBase']
import numpy as np
import os
import subprocess
from paddle.fluid import core
from collections import OrderedDict
import paddle.fluid as fluid
from google.protobuf import text_format
from paddle.fluid import debugger
from paddle.fluid.framework import Program
from paddle.fluid.proto import framework_pb2
from ..utils.fs import FS, LocalFS, HDFSClient
class UtilFactory(object):
def _create_util(self, context):
def _create_util(self, context=None):
util = UtilBase()
util._set_strategy(context["valid_strategy"])
util._set_role_maker(context["role_maker"])
if context is not None and "valid_strategy" in context:
util._set_strategy(context["valid_strategy"])
if context is not None and "role_maker" in context:
util._set_role_maker(context["role_maker"])
return util
......@@ -38,43 +53,390 @@ class UtilBase(object):
def _set_role_maker(self, role_maker):
self.role_maker = role_maker
'''
def set_file_system(self, fs_client):
assert isinstance(
fs_client,
FS), "fs_client must be the instance of paddle.fleet.utils.FS"
self.fs_client = fs_client
def broadcast(self):
pass
def __check_comm_world(self, comm_world="worker"):
if not self.role_maker._role_is_generated:
self.role_maker.generate_role()
def all_gather(self):
pass
_comm_world = None
comm_world_upper = comm_world.upper()
if comm_world_upper == "WORKER":
if not self.role_maker.is_worker():
print(
"warning: current role is not worker in collective_func(comm_world=\"worker\")"
)
_comm_world = self.role_maker._node_type_comm
elif comm_world_upper == "SERVER":
if not self.role_maker.is_server():
print(
"warning: current role is not server in collective_func(comm_world=\"server\")"
)
_comm_world = self.role_maker._node_type_comm
elif comm_world_upper == "ALL":
_comm_world = self.role_maker._all_comm
else:
raise ValueError(
"not support comm_world, please choose one from [worker, server, all]"
)
def all_reduce(self):
pass
return _comm_world
def reduce_scatter(self):
def all_reduce(self, input, mode, comm_world="worker"):
_comm_world = self.__check_comm_world(comm_world)
return self.role_maker._all_reduce(_comm_world, input, mode)
def barrier(self, comm_world="worker"):
_comm_world = self.__check_comm_world(comm_world)
self.role_maker._barrier(_comm_world)
def all_gather(self, input, comm_world="worker"):
_comm_world = self.__check_comm_world(comm_world)
return self.role_maker._all_gather(_comm_world, input)
def broadcast(self):
pass
def reduce(self):
def scatter(self):
pass
def get_file_shard(self, files):
pass
"""
split files before distributed training,
example 1: files is [a, b, c ,d, e] and trainer_num = 2, then trainer
0 gets [a, b, c] and trainer 1 gets [d, e].
example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets
[a], trainer 1 gets [b], trainer 2 gets []
def feed_gen(self, batch_size, feed_vars_dims, feeded_vars_filelist):
pass
Args:
files(list): file list need to be read.
def save_program(program, output_dir):
pass
Returns:
list: files belongs to this worker.
"""
if not isinstance(files, list):
raise TypeError("files should be a list of file need to be read.")
def load_program(input_dir):
pass
trainer_id = self.role_maker.worker_index()
trainers = self.role_maker.worker_num()
def load_var():
pass
remainder = len(files) % trainers
blocksize = int(len(files) / trainers)
def save_var():
pass
blocks = [blocksize] * trainers
for i in range(remainder):
blocks[i] += 1
def print_on_rank(self):
pass
'''
trainer_files = [[]] * trainers
begin = 0
for i in range(trainers):
trainer_files[i] = files[begin:begin + blocks[i]]
begin += blocks[i]
return trainer_files[trainer_id]
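A usage sketch matching the docstring examples above; it assumes a role maker reporting two trainers has already been attached via _set_role_maker:

files = ["a", "b", "c", "d", "e"]
# On trainer 0 of 2 this returns ["a", "b", "c"]; on trainer 1, ["d", "e"].
shard = fleet_util.get_file_shard(files)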
def print_on_rank(self, message, rank_id):
if self.role_maker.worker_index() != rank_id:
return
print(message)
def _save_program(self, program, model_filename='__model__', is_text=False):
if is_text:
with open(model_filename, "w") as f:
f.write(str(program))
else:
with open(model_filename, "wb") as f:
f.write(program.desc.serialize_to_string())
def _load_program(self, path, is_text):
def load_program_binary(path):
"""load program from binary string file"""
with open(path, "rb") as f:
program_desc_str = f.read()
return Program.parse_from_string(program_desc_str)
def load_program_text(path):
"""load program from human-readable text file"""
with open(path, "r") as f:
program_desc_text = f.read()
prog_desc = framework_pb2.ProgramDesc()
text_format.Merge(program_desc_text, prog_desc)
return Program.parse_from_string(prog_desc.SerializeToString())
if is_text:
return load_program_text(path)
else:
return load_program_binary(path)
def _program_type_trans(self, prog_dir, prog_fn, is_text):
prog = self._load_program(os.path.join(prog_dir, prog_fn), is_text)
prog_out_fn = prog_fn + ".bin" if is_text else prog_fn + ".pbtxt"
self._save_program(prog,
                   os.path.join(prog_dir, prog_out_fn), not is_text)
return prog_out_fn
def _visualize_graphviz(self, program, output_dir, output_filename):
block = program.global_block()
dot_path = os.path.join(output_dir, output_filename + '.dot')
pdf_path = os.path.join(output_dir, output_filename + '.pdf')
debugger.draw_block_graphviz(block, path=dot_path)
cmd = ["dot", "-Tpdf", dot_path, "-o", pdf_path]
p = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
p.wait()
def _proto_check(self, config):
train_prog = self._load_program(config.train_prog_path,
config.is_text_train_program)
pruned_prog = self._load_program(config.pruned_prog_path,
config.is_text_pruned_program)
is_match = True
pruned_vars = [(v.name, v) for v in pruned_prog.list_vars()
if fluid.io.is_persistable(v)]
pruned_vars = OrderedDict(pruned_vars)
pruned_vars_name = [name for name in pruned_vars]
print("persistable vars in pruned program: {}".format(pruned_vars_name))
# feed and fetch ops are added to the pruned program during pruning, so they need not exist in the train program
feed_fetch_type_list = [
core.VarDesc.VarType.FEED_MINIBATCH, core.VarDesc.VarType.FETCH_LIST
]
for var_name in pruned_vars:
var = pruned_vars[var_name]
# feed and fetch ops are added to the pruned program during pruning, so they need not exist in the train program
if var.type in feed_fetch_type_list:
break
try:
train_prog_var = train_prog.global_block().var(var_name)
except ValueError as e:
print(
    "Cannot find variable '%s' in the train program, please check pruning."
    % var_name)
is_match = False
continue
if var.shape != train_prog_var.shape or var.dtype != train_prog_var.dtype:
print(
    "variable {} does not match: pruned program shape {} dtype {}, train program shape {} dtype {}".
    format(var_name, var.shape, var.dtype, train_prog_var.shape,
           train_prog_var.dtype))
is_match = False
return is_match
def _params_check(self, config):
def feed_gen(batch_size, feeded_vars_dims, feeded_vars_filelist):
def reader(batch_size, fn, dim):
data = []
if isinstance(dim, list) or isinstance(dim, tuple):
shape = list(dim)
_temp = 1
for x in dim:
_temp = _temp * x
dim = _temp
else:
shape = [dim]
shape = [batch_size] + shape
dim = dim * batch_size
for line in open(fn, 'r'):
fields = line.strip().split(' ')
fields = [float(d) for d in fields]
while len(fields) >= dim:
tmp = fields[:dim]
fields = fields[dim:]
data.append(np.array(tmp).reshape(shape))
return data
batch_feed = []
for i, fn in enumerate(feeded_vars_filelist):
batch_feed.append(reader(batch_size, fn, feeded_vars_dims[i]))
return batch_feed
prog = self._load_program(
os.path.join(config.dump_model_dir, config.dump_program_filename),
config.is_text_dump_program)
model_filename = config.dump_program_filename
if config.is_text_dump_program:
    model_filename = self._program_type_trans(
        config.dump_model_dir, config.dump_program_filename,
        config.is_text_dump_program)
saved_params = [
v for v in prog.list_vars() if fluid.io.is_persistable(v)
]
print("persistable vars in dump program: {}".format(
[v.name for v in saved_params]))
def check_not_expected_ops(prog, not_expected_op_types):
op_types_set = set()
for op in prog.global_block().ops:
if op.type in not_expected_op_types and op.type not in op_types_set:
op_types_set.add(op.type)
return op_types_set
not_expected_op_types = check_not_expected_ops(prog, ["lookup_table"])
if len(not_expected_op_types) > 0:
print(
    "found op type '{}' in program, please check if your program is pruned correctly!".
    format(list(not_expected_op_types)))
return False
place = fluid.CPUPlace()
exe = fluid.Executor(place)
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
inference_program, feed_target_names, fetch_targets = \
fluid.io.load_inference_model(config.dump_model_dir, exe, model_filename=model_filename,
params_filename=config.save_params_filename)
# check program vars and saved vars shape
orig_para_shape = {
each_var.name: tuple(each_var.desc.shape())
for each_var in saved_params
}
for each_var in saved_params:
var_temp = fluid.global_scope().find_var(each_var.name)
assert var_temp is not None, "can't find var: " + each_var.name
new_shape = (np.array(var_temp.get_tensor())).shape
assert each_var.name in orig_para_shape, each_var.name + " MUST be in the var list"
orig_shape = orig_para_shape.get(each_var.name)
if new_shape != orig_shape:
raise RuntimeError(
"Shape not matching: the Program requires a parameter with a shape of ({}), "
"while the loaded parameter (namely [ {} ]) has a shape of ({}).".
format(orig_shape, each_var.name, new_shape))
# check feed/fetch vars in program and config
feed_config = config.feed_config
fetch_config = config.fetch_config
fetch_targets_names = [v.name for v in fetch_targets]
if not feed_target_names:
print("warning! no feed targets in program.")
if not fetch_targets_names:
print("warning! no fetch targets in program.")
fetch_list = fetch_targets
feed_name_list = feed_target_names
if feed_config.feeded_vars_names is not None and feed_target_names != feed_config.feeded_vars_names:
print(
"warning! feed vars in program and config are diff: feed in program: {}. feed in config {}.".
format(feed_target_names, feed_config.feeded_vars_names))
feed_name_list = feed_config.feeded_vars_names
# remove feed op in inference_program. new feed op will be added in exe.run
global_block = inference_program.global_block()
need_to_remove_op_index = []
for i, op in enumerate(global_block.ops):
op.desc.set_is_target(False)
if op.type == "feed": # only remove feed op here
need_to_remove_op_index.append(i)
for index in need_to_remove_op_index[::-1]:
global_block._remove_op(index)
if fetch_config.fetch_vars_names is not None and fetch_targets_names != fetch_config.fetch_vars_names:
print(
"warning! fetch vars in program and config are diff: fetch in program: {}. fetch in config {}.".
format(fetch_targets_names, fetch_config.fetch_vars_names))
fetch_list = [
inference_program.global_block().var(i)
for i in fetch_config.fetch_vars_names
]
# remove fetch op in inference_program. new fetch op will be added in exe.run
global_block = inference_program.global_block()
need_to_remove_op_index = []
for i, op in enumerate(global_block.ops):
op.desc.set_is_target(False)
if op.type == "fetch": # only remove fetch op here
need_to_remove_op_index.append(i)
for index in need_to_remove_op_index[::-1]:
global_block._remove_op(index)
# if fetch_list have lod tensor
return_numpy = all([v.lod_level == 0 for v in fetch_list])
# try dump fetch_targets
feed_tensors = []
assert len(feed_config.feeded_vars_names) == len(
feed_config.feeded_vars_dims) == len(
feed_config.feeded_vars_types)
# check program vars and feed tensor shape in config
for i in range(len(feed_config.feeded_vars_names)):
var = inference_program.global_block().var(
feed_config.feeded_vars_names[i])
if not isinstance(feed_config.feeded_vars_dims[i],
(list, tuple)):
tensor_shape = (feed_config.feeded_vars_dims[i], )
else:
tensor_shape = tuple(feed_config.feeded_vars_dims[i])
feed_config.feeded_vars_dims[i] = tensor_shape
var_shape = var.shape[1:]
if tensor_shape != var_shape:
raise RuntimeError(
"feed variable '{}' shape not match. infer program shape: {}. feed tensor shape: {}".
format(feed_config.feeded_vars_names[i], var_shape,
tensor_shape))
if not feed_config.feeded_vars_filelist:
print("generate random feed vars.")
for i in range(len(feed_config.feeded_vars_names)):
var = inference_program.global_block().var(
feed_config.feeded_vars_names[i])
# create fake feed tensor. if lod_level > 1, should create_lod_tensor()
if var.lod_level == 0:
feed_tensors.append(
np.array(
np.random.random(
tuple([config.batch_size] + list(
feed_config.feeded_vars_dims[i]))),
dtype=feed_config.feeded_vars_types[i]))
elif var.lod_level == 1:
t = np.array(
np.random.random(
tuple([config.batch_size] + list(
feed_config.feeded_vars_dims[i]))),
dtype=feed_config.feeded_vars_types[i])
feed_tensors.append(
fluid.create_lod_tensor(t, [[1] * config.batch_size
], place))
else:
raise RuntimeError(
    "vars with lod_level >= 2 are not supported yet in this infer program check tool."
)
results = exe.run(inference_program,
feed={
name: feed_tensors[i]
for i, name in enumerate(feed_name_list)
},
fetch_list=fetch_list,
return_numpy=return_numpy)
else:
print("load feed vars from files: {}.".format(
feed_config.feeded_vars_filelist))
feed_vars = [
inference_program.global_block().var(
feed_config.feeded_vars_names[i])
for i in range(len(feed_config.feeded_vars_names))
]
feeder = fluid.DataFeeder(feed_list=feed_vars, place=place)
batch_feed = feed_gen(config.batch_size,
feed_config.feeded_vars_dims,
feed_config.feeded_vars_filelist)
slots = [batch_feed]
results = exe.run(inference_program,
feed=feeder.feed(slots),
fetch_list=fetch_list,
return_numpy=return_numpy)
for i, v in enumerate(fetch_list):
print("fetch_targets name: %s" % v.name)
print("fetch_targets: {}".format(results[i]))
return results
fleet_util = UtilFactory()._create_util(None)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fs import *
from .http_server import KVHandler, KVHTTPServer, KVServer
__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import subprocess
import multiprocessing
from datetime import datetime
import re
import copy
import errno
import time
import logging
import six
import abc
import paddle.fluid as fluid
import functools
from pathlib import PurePosixPath, Path
import shutil
__all__ = [
'FS', 'LocalFS', 'HDFSClient', 'ExecuteError', 'FSTimeOut',
'FSFileExistsError', 'FSFileNotExistsError'
]
class ExecuteError(Exception):
pass
class FSFileExistsError(Exception):
pass
class FSFileNotExistsError(Exception):
pass
class FSTimeOut(Exception):
pass
class FS(object):
@abc.abstractmethod
def ls_dir(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def is_file(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def is_dir(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def is_exist(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def upload(self, local_path, fs_path):
raise NotImplementedError
@abc.abstractmethod
def download(self, fs_path, local_path):
raise NotImplementedError
@abc.abstractmethod
def mkdirs(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def delete(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def need_upload_download(self):
raise NotImplementedError
@abc.abstractmethod
def rename(self, fs_src_path, fs_dst_path):
raise NotImplementedError
@abc.abstractmethod
def mv(self, fs_src_path, fs_dst_path):
raise NotImplementedError
@abc.abstractmethod
def upload_dir(self, local_dir, dest_dir):
raise NotImplementedError
@abc.abstractmethod
def glob(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def stat(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def walk(self, fs_path):
raise NotImplementedError
class LocalFS(FS):
def ls_dir(self, fs_path):
if not self.is_exist(fs_path):
return [], []
dirs = []
files = []
for f in os.listdir(fs_path):
if os.path.isdir(fs_path + "/" + f):
dirs.append(f)
else:
files.append(f)
return dirs, files
def mkdirs(self, fs_path):
assert not os.path.isfile(fs_path), "{} is already a file".format(
fs_path)
os.system("mkdir -p {}".format(fs_path))
def is_file(self, fs_path):
return os.path.isfile(fs_path)
def is_dir(self, fs_path):
return os.path.isdir(fs_path)
def is_exist(self, fs_path):
return os.path.exists(fs_path)
def _rmr(self, fs_path):
shutil.rmtree(fs_path)
def _rm(self, fs_path):
os.remove(fs_path)
def delete(self, fs_path):
if not self.is_exist(fs_path):
return
if os.path.isfile(fs_path):
return self._rm(fs_path)
return self._rmr(fs_path)
def rename(self, fs_src_path, fs_dst_path):
os.rename(fs_src_path, fs_dst_path)
def need_upload_download(self):
return False
def touch(self, fs_path):
return Path(fs_path).touch()
def mv(self, src_path, dst_path):
if not self.is_exist(src_path):
raise FSFileNotExistsError
if self.is_exist(dst_path):
raise FSFileExistsError
return self.rename(src_path, dst_path)
"""HDFS Utils."""
def _handle_errors(f):
def handler(*args, **kwargs):
start = time.time()
while True:
try:
return f(*args, **kwargs)
except ExecuteError as e:
o = args[0]
time_out = float(o._time_out) / 1000.0
inter = float(o._sleep_inter) / 1000.0
if time.time() - start >= time_out:
raise FSTimeOut
time.sleep(inter)
return functools.wraps(f)(handler)
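A minimal sketch of what the decorator above provides, using a hypothetical flaky operation; _time_out and _sleep_inter are read from the decorated object, in milliseconds:

class ExampleClient(object):
    def __init__(self):
        self._time_out = 2000     # give up after ~2 seconds
        self._sleep_inter = 100   # wait 100 ms between retries
        self._calls = 0

    @_handle_errors
    def flaky_op(self):
        self._calls += 1
        if self._calls < 3:
            raise ExecuteError()  # retried until it succeeds or times out
        return "ok"

# Succeeds on the third attempt: ExampleClient().flaky_op() == "ok"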
class HDFSClient(FS):
def __init__(
self,
hadoop_home,
configs,
time_out=5 * 60 * 1000, #ms
sleep_inter=1000): #ms
# Raise exception if JAVA_HOME not exists.
java_home = os.environ["JAVA_HOME"]
self.pre_commands = []
hadoop_bin = '%s/bin/hadoop' % hadoop_home
self.pre_commands.append(hadoop_bin)
dfs = 'fs'
self.pre_commands.append(dfs)
if configs:
for k, v in six.iteritems(configs):
self.pre_commands.append('-D%s=%s' % (k, v))
self._time_out = time_out
self._sleep_inter = sleep_inter
self._base_cmd = " ".join(self.pre_commands)
self._bd_err_re = re.compile(
r'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:')
def _run_cmd(self, cmd, redirect_stderr=False):
ret, output = fluid.core.shell_execute_cmd(cmd, 0, 0, redirect_stderr)
return int(ret), output.splitlines()
@_handle_errors
def ls_dir(self, fs_path):
"""
list directory under fs_path, and only give the pure name, not include the fs_path
"""
if not self.is_exist(fs_path):
return [], []
cmd = "{} -ls {}".format(self._base_cmd, fs_path)
ret, lines = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
dirs = []
files = []
for line in lines:
arr = line.split()
if len(arr) != 8:
continue
if fs_path not in arr[7]:
continue
p = PurePosixPath(arr[7])
if arr[0][0] == 'd':
dirs.append(p.name)
else:
files.append(p.name)
return dirs, files
def _test_match(self, lines):
for l in lines:
m = self._bd_err_re.match(l)
if m is not None:
return m
return None
@_handle_errors
def is_dir(self, fs_path):
if not self.is_exist(fs_path):
return False
cmd = "{} -test -d {}".format(
self._base_cmd, fs_path, redirect_stderr=True)
ret, lines = self._run_cmd(cmd)
if ret:
# other error
if self._test_match(lines) is not None:
raise ExecuteError
return False
return True
def is_file(self, fs_path):
if not self.is_exist(fs_path):
return False
return not self.is_dir(fs_path)
@_handle_errors
def is_exist(self, fs_path):
cmd = "{} -ls {} ".format(self._base_cmd, fs_path)
ret, out = self._run_cmd(cmd, redirect_stderr=True)
if ret != 0:
for l in out:
if "No such file or directory" in l:
return False
raise ExecuteError
return True
@_handle_errors
def upload(self, local_path, fs_path):
if self.is_exist(fs_path):
raise FSFileExistsError
local = LocalFS()
if not local.is_exist(local_path):
raise FSFileNotExistsError
cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
ret, lines = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def download(self, fs_path, local_path):
if self.is_exist(local_path):
raise FSFileExistsError
if not self.is_exist(fs_path):
raise FSFileNotExistsError
cmd = "{} -get {} {}".format(self._base_cmd, fs_path, local_path)
ret, lines = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def mkdirs(self, fs_path):
if self.is_exist(fs_path):
return
cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
ret, lines = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def mv(self, fs_src_path, fs_dst_path, test_exists=True):
if test_exists:
if not self.is_exist(fs_src_path):
raise FSFileNotExistsError
if self.is_exist(fs_dst_path):
raise FSFileExistsError
cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
ret, _ = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def _rmr(self, fs_path):
cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
ret, _ = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def _rm(self, fs_path):
cmd = "{} -rm {}".format(self._base_cmd, fs_path)
ret, _ = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
def delete(self, fs_path):
if not self.is_exist(fs_path):
return
is_dir = self.is_dir(fs_path)
if is_dir:
return self._rmr(fs_path)
return self._rm(fs_path)
def need_upload_download(self):
return True
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Http Server."""
import logging
import six
# NOTE: HTTPServer has a different name in python2 and python3
if six.PY2:
from BaseHTTPServer import HTTPServer
import SimpleHTTPServer
else:
from http.server import HTTPServer
import http.server as SimpleHTTPServer
import time
import threading
import socket
def get_logger(name, level, fmt):
logger = logging.getLogger(name)
logger.setLevel(level)
handler = logging.FileHandler('http.log', mode='w')
formatter = logging.Formatter(fmt=fmt)
handler.setFormatter(formatter)
logger.addHandler(handler)
return logger
_http_server_logger = get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
"""
kv handler class for kv http server,
it defines the way to get/set kv in server.
"""
def do_GET(self):
"""
get method for kv handler, get value according to key.
"""
log_str = "GET " + self.address_string() + self.path
paths = self.path.split('/')
if len(paths) < 3:
print('len of request path must be 3: ' + self.path)
self.send_status_code(400)
return
_, scope, key = paths
with self.server.kv_lock:
value = self.server.kv.get(scope, {}).get(key)
if value is None:
log_str += ' , key not found: ' + key
self.send_status_code(404)
else:
log_str += ' , key found: ' + key
self.send_response(200)
self.send_header("Content-Length", str(len(value)))
self.end_headers()
self.wfile.write(value)
_http_server_logger.info(log_str)
def do_PUT(self):
"""
put method for kv handler, set value according to key.
"""
log_str = "PUT " + self.address_string() + self.path
paths = self.path.split('/')
if len(paths) < 3:
print('len of request path must be 3: ' + self.path)
self.send_status_code(400)
return
_, scope, key = paths
content_length = int(self.headers['Content-Length'])
try:
value = self.rfile.read(content_length)
except Exception:
    print("received an invalid request")
self.send_status_code(404)
return
with self.server.kv_lock:
if self.server.kv.get(scope) is None:
self.server.kv[scope] = {}
self.server.kv[scope][key] = value
self.send_status_code(200)
_http_server_logger.info(log_str)
def do_DELETE(self):
"""
delete method for kv handler, delete value according to key.
"""
log_str = "DELETE " + self.address_string() + self.path
paths = self.path.split('/')
if len(paths) < 3:
print('len of request path must be 3: ' + self.path)
self.send_status_code(400)
return
_, scope, key = paths
with self.server.delete_kv_lock:
if self.server.delete_kv.get(scope) is None:
self.server.delete_kv[scope] = []
self.server.delete_kv[scope].append(key)
self.send_status_code(200)
_http_server_logger.info(log_str)
def log_message(self, format, *args):
"""
ignore all logging messages in kv handler.
"""
pass
def send_status_code(self, code):
"""
send status code back to client.
"""
self.send_response(code)
self.send_header("Content-Length", 0)
self.end_headers()
class KVHTTPServer(HTTPServer, object):
"""
it is a http server storing kv pairs.
"""
def __init__(self, port, handler):
"""Init."""
super(KVHTTPServer, self).__init__(('', port), handler)
self.delete_kv_lock = threading.Lock()
self.delete_kv = {}
self.kv_lock = threading.Lock()
self.kv = {}
    def get_deleted_size(self, key):
        """
        Returns how many keys have been deleted under `key`.
        """
        ret = 0
        with self.delete_kv_lock:
            ret = len(self.delete_kv.get(key, []))
        return ret
class KVServer:
"""
    A server that stores KV pairs, with an HTTP server inside.
"""
    def __init__(self, port, size=None):
        """Init."""
        self.http_server = KVHTTPServer(port, KVHandler)
        self.listen_thread = None
        self.size = size if size is not None else {}
def start(self):
"""
        Starts the server; it runs until the user calls stop().
"""
self.listen_thread = threading.Thread(
target=lambda: self.http_server.serve_forever())
self.listen_thread.start()
def stop(self):
"""
stop server and clear its resources.
"""
self.http_server.shutdown()
self.listen_thread.join()
self.http_server.server_close()
def shoud_stop(self):
"""
return whether the server should stop.
Returns:
ret(bool): whether the server should stop
"""
for key in self.size:
s = self.http_server.get_deleted_size(key)
if s != self.size.get(key, 0):
return False
return True
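# A minimal usage sketch of KVServer (Python 3 shown for brevity; the
# /scope/key path convention follows KVHandler above).
import http.client

server = KVServer(8003)
server.start()

conn = http.client.HTTPConnection("127.0.0.1", 8003)
conn.request("PUT", "/trainer/0", body=b"ready")
conn.getresponse().read()
conn.request("GET", "/trainer/0")
print(conn.getresponse().read())  # b'ready'
conn.request("DELETE", "/trainer/0")
conn.getresponse().read()

server.stop()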
......@@ -166,17 +166,34 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads)
sysstr = platform.system()
read_env_flags = [
'check_nan_inf',
'fast_check_nan_inf',
'benchmark',
'eager_delete_scope',
'fraction_of_cpu_memory_to_use',
'initial_cpu_memory_in_mb',
'init_allocated_mem',
'paddle_num_threads',
'dist_threadpool_size',
'eager_delete_tensor_gb',
'fast_eager_deletion_mode',
'memory_fraction_of_eager_deletion',
'allocator_strategy',
'reader_queue_speed_test_mode',
'print_sub_graph_dir',
'pe_profile_fname',
'inner_op_parallelism',
'enable_parallel_graph',
'fuse_parameter_groups_size',
'multiple_of_cupti_buffer_size',
'fuse_parameter_memory_size',
'tracer_profile_fname',
'dygraph_debug',
'use_system_allocator',
'enable_unused_var_check',
'free_idle_chunk',
'free_when_no_cache_hit',
'call_stack_level',
]
if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory')
......@@ -208,12 +225,19 @@ def __bootstrap__():
if core.is_compiled_with_cuda():
read_env_flags += [
'fraction_of_gpu_memory_to_use',
'initial_gpu_memory_in_mb',
'reallocate_gpu_memory_in_mb',
'cudnn_deterministic',
'enable_cublas_tensor_op_math',
'conv_workspace_size_limit',
'cudnn_exhaustive_search',
'selected_gpus',
'sync_nccl_allreduce',
'cudnn_batchnorm_spatial_persistent',
'gpu_allocator_retry_time',
'local_exe_sub_scope_limit',
'gpu_memory_limit_mb',
]
core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0])
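# Hedged usage sketch: each name in read_env_flags can be supplied through an
# environment variable named FLAGS_<name>, because __bootstrap__ passes
# --tryfromenv to gflags; values must be set before paddle.fluid is imported.
import os

os.environ['FLAGS_call_stack_level'] = '2'          # richer C++ error stacks
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'  # eager tensor GC

import paddle.fluid as fluid  # __bootstrap__ picks both flags up here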
......
......@@ -16,12 +16,13 @@ from __future__ import print_function
import os
import collections
from ..framework import Variable, default_main_program, in_dygraph_mode, dygraph_only, Parameter, ParamBase, _varbase_creator, _dygraph_tracer
import pickle
import six
from . import learning_rate_scheduler
import warnings
from .. import core
from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME, _load_persistable_vars
__all__ = [
'save_dygraph',
......@@ -140,22 +141,83 @@ def load_dygraph(model_path, keep_name_table=False):
elif model_prefix.endswith(".pdopt"):
model_prefix = model_prefix[:-6]
params_file_path = model_prefix + ".pdparams"
if not os.path.exists(params_file_path):
raise RuntimeError("Parameter file [ {} ] not exists".format(
params_file_path))
with open(params_file_path, 'rb') as f:
para_dict = pickle.load(f) if six.PY2 else pickle.load(
f, encoding='latin1')
if not keep_name_table and "StructuredToParameterName@@" in para_dict:
del para_dict["StructuredToParameterName@@"]
para_dict = None
opti_dict = None
params_file_path = model_prefix + ".pdparams"
opti_file_path = model_prefix + ".pdopt"
if not os.path.exists(params_file_path) and not os.path.exists(
opti_file_path):
# Load state dict by `jit.save` save format
        # TODO(chenweihang): [Why not support `io.save_inference_model` save format here]
# The model saved by `save_inference_model` does not completely correspond to
# the information required by the `state_dict` under the dygraph.
# Although we reluctantly restore the `state_dict` in some scenarios,
# this may not be complete and there are some limitations, so this function
# will be considered later. The limitations include:
        # 1. `save_inference_model` does not save structured names, so we need to remind
        # the user to configure the `use_structured_name` argument when calling `set_dict`,
        # but this argument is currently not public
        # 2. if `save_inference_model` saves all persistable variables in a single file,
        # the user needs to provide the variable name list to load `state_dict`
# 1. check model path
if not os.path.isdir(model_prefix):
raise ValueError("Model saved directory '%s' is not exists." %
model_prefix)
# 2. load `__variables.info__`
var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME)
if not os.path.exists(var_info_path):
raise RuntimeError(
"No target can be loaded. Now only supports loading `state_dict` from "
"the result saved by `imperative.save` and `imperative.jit.save`."
)
with open(var_info_path, 'rb') as f:
extra_var_info = pickle.load(f)
# 3. load `__variables__`
# TODO(chenweihang): now only supports loading from default save format:
# - all persistable vars saved in one file named `__variables__`
# for other case, we may need to modify the arguments of this API
var_file_path = os.path.join(model_prefix, VARIABLE_FILENAME)
if not os.path.exists(var_file_path):
raise RuntimeError(
"The parameter file to be loaded was not found. "
"Now only supports loading from the default save format, "
"and does not support custom params_filename and "
"save parameters separately.")
# 4. load all persistable vars
load_var_list = []
for name in sorted(extra_var_info):
var = _varbase_creator(name=name, persistable=True)
load_var_list.append(var)
_dygraph_tracer().trace_op(
type='load_combine',
inputs={},
outputs={'Out': load_var_list},
attrs={'file_path': var_file_path})
# 5. construct state_dict
para_dict = dict()
for var in load_var_list:
structured_name = extra_var_info[var.name].get('structured_name',
None)
if structured_name is None:
raise RuntimeError(
"Cannot find saved variable (%s)'s structured name in saved model.",
var.name)
para_dict[structured_name] = var.numpy()
# NOTE: `jit.save` doesn't save optimizer state
else:
# Load state dict by `save_dygraph` save format
if os.path.exists(params_file_path):
with open(params_file_path, 'rb') as f:
para_dict = pickle.load(f) if six.PY2 else pickle.load(
f, encoding='latin1')
if not keep_name_table and "StructuredToParameterName@@" in para_dict:
del para_dict["StructuredToParameterName@@"]
if os.path.exists(opti_file_path):
with open(opti_file_path, 'rb') as f:
opti_dict = pickle.load(f) if six.PY2 else pickle.load(
f, encoding='latin1')
return para_dict, opti_dict
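# Hedged usage sketch of the two branches above (paths are illustrative).
import paddle.fluid as fluid

with fluid.dygraph.guard():
    # Branch 1: "emb.pdparams" / "emb.pdopt" written by save_dygraph.
    para_state, opti_state = fluid.dygraph.load_dygraph("emb")

    # Branch 2: a directory written by jit.save; opti_state comes back None
    # because jit.save does not persist optimizer state.
    para_state, opti_state = fluid.dygraph.load_dygraph("saved_infer_model")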
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import traceback
from paddle.fluid.dygraph.dygraph_to_static.origin_info import Location, OriginInfo, global_origin_info_map
ERROR_DATA = "Error data about original source code information and traceback."
def attach_error_data(error, in_runtime=False):
"""
    Attaches error data about the original source code information and traceback to an error.
    Args:
        error(Exception): A native error.
        in_runtime(bool): `error` is raised at runtime if in_runtime is True, otherwise at compile time.
    Returns:
        An error with attached data about the original source code information and traceback.
"""
e_type, e_value, e_traceback = sys.exc_info()
tb = traceback.extract_tb(e_traceback)[1:]
error_data = ErrorData(e_type, e_value, tb, global_origin_info_map)
error_data.in_runtime = in_runtime
setattr(error, ERROR_DATA, error_data)
return error
class TraceBackFrame(OriginInfo):
"""
Traceback frame information.
"""
def __init__(self, location, function_name, source_code):
self.location = location
self.function_name = function_name
self.source_code = source_code
class ErrorData(object):
"""
Error data attached to an exception which is raised in un-transformed code.
"""
def __init__(self, error_type, error_value, origin_traceback,
origin_info_map):
self.error_type = error_type
self.error_value = error_value
self.origin_traceback = origin_traceback
self.origin_info_map = origin_info_map
self.in_runtime = False
def create_exception(self):
message = self.create_message()
new_exception = self.error_type(message)
setattr(new_exception, ERROR_DATA, self)
return new_exception
def create_message(self):
"""
        Creates a custom error message that includes the trace stack with the user's original dygraph source code information.
"""
message_lines = []
# Step1: Adds header message to prompt users that the following is the original information.
header_message = "In user code:"
message_lines.append(header_message)
message_lines.append("")
# Simplify error value to improve readability if error is raised in runtime
if self.in_runtime:
self._simplify_error_value()
message_lines.append(str(self.error_value))
return '\n'.join(message_lines)
# Step2: Optimizes stack information with source code information of dygraph from user.
for filepath, lineno, funcname, code in self.origin_traceback:
loc = Location(filepath, lineno)
dygraph_func_info = self.origin_info_map.get(loc.line_location,
None)
if dygraph_func_info:
# TODO(liym27): more information to prompt users that this is the original information.
# Replaces trace stack information about transformed static code with original dygraph code.
traceback_frame = self.origin_info_map[loc.line_location]
else:
traceback_frame = TraceBackFrame(loc, funcname, code)
message_lines.append(traceback_frame.formated_message())
# Step3: Adds error message like "TypeError: dtype must be int32, but received float32".
error_message = " " * 4 + traceback.format_exception_only(
self.error_type, self.error_value)[0].strip("\n")
message_lines.append(error_message)
return '\n'.join(message_lines)
def _simplify_error_value(self):
"""
Simplifies error value to improve readability if error is raised in runtime.
NOTE(liym27): The op callstack information about transformed static code has been replaced with original dygraph code.
TODO(liym27):
1. Need a more robust way because the code of start_trace may change.
2. Set the switch to determine whether to simplify error_value
"""
assert self.in_runtime is True
error_value_lines = str(self.error_value).split("\n")
error_value_lines_strip = [mes.lstrip(" ") for mes in error_value_lines]
start_trace = "outputs = static_func(*inputs)"
start_idx = error_value_lines_strip.index(start_trace)
error_value_lines = error_value_lines[start_idx + 1:]
error_value_str = '\n'.join(error_value_lines)
self.error_value = self.error_type(error_value_str)
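# Hedged sketch of the intended flow: attach_error_data hangs an ErrorData on
# the caught exception, and create_exception rebuilds a user-facing error
# whose message comes from create_message() (static_func is illustrative).
def run_static(static_func, *inputs):
    try:
        return static_func(*inputs)
    except Exception as e:
        attach_error_data(e)  # compile-time attachment
        error_data = getattr(e, ERROR_DATA)
        raise error_data.create_exception()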
......@@ -39,32 +39,21 @@ GENERATE_VARIABLE_PREFIX = 'generate_variable'
def create_while_node(condition_name, body_name, loop_var_names):
# NOTE(liym27):
# It's better to parse the source code into an AST node than to customize an AST node
# including child nodes, because it is easy to mistake the ast node type when customizing the node.
#
    # For example: loop_var_names = [a, b, foo.x], the type of `a` or `b` is gast.Name,
    # but the type of `foo.x` is gast.Attribute.
while_func_name = "fluid.dygraph.dygraph_to_static.convert_operators.convert_while_loop"
while_node_str = "[{}] = {}({}, {}, [{}])".format(
",".join(loop_var_names), while_func_name, condition_name, body_name,
",".join(loop_var_names))
while_node = gast.parse(while_node_str).body[0]
return while_node
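# Hedged sketch reproducing the statement string that gets parsed above.
loop_var_names = ["i", "x.a"]
while_node_str = "[{}] = {}({}, {}, [{}])".format(
    ",".join(loop_var_names),
    "fluid.dygraph.dygraph_to_static.convert_operators.convert_while_loop",
    "cond_0", "body_0", ",".join(loop_var_names))
print(while_node_str)
# [i,x.a] = fluid.dygraph.dygraph_to_static.convert_operators.convert_while_loop(cond_0, body_0, [i,x.a])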
class NameVisitor(gast.NodeVisitor):
......
......@@ -19,8 +19,12 @@ import inspect
import gast
from paddle.fluid import core
from paddle.fluid.framework import Program
# NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node.
ORIGI_INFO = "Original information of source code for ast node."
ORIGI_INFO_MAP = "Original information map of source code."
class Location(object):
......@@ -64,6 +68,15 @@ class OriginInfo(object):
return "{} \nsource_code: {} in function {}\n ".format(
self.location, self.source_code, self.function_name)
def formated_message(self):
return ' File "{}", line {}, in {}\n\t{}'.format(
self.location.filepath, self.location.lineno, self.function_name,
self.source_code.lstrip())
def as_frame(self):
return (self.location.filepath, self.location.lineno,
self.function_name, self.source_code.lstrip())
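# Hedged sketch of formated_message output, built with the TraceBackFrame
# subclass from error.py above; Location(filepath, lineno) is assumed from
# its usages elsewhere in this change.
frame = TraceBackFrame(
    Location("path/to/model.py", 20), "forward", "    y = self.fc(x)")
print(frame.formated_message())
#   File "path/to/model.py", line 20, in forward
#       y = self.fc(x)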
class OriginInfoAttacher(gast.NodeTransformer):
"""
......@@ -119,7 +132,12 @@ class OriginInfoAttacher(gast.NodeTransformer):
return self.col_offset + node.col_offset
global_origin_info_map = {}
def create_and_update_origin_info_map(transformed_node,
static_func,
is_global=True):
"""
    Creates an origin-information map between the transformed static function and the original dygraph function.
......@@ -156,6 +174,10 @@ def create_origin_info_map(transformed_node, static_func):
origin_info_map[static_loc] = dygraph_info
global_origin_info_map.update(origin_info_map)
if is_global:
return global_origin_info_map
return origin_info_map
......@@ -234,3 +256,63 @@ def ast_walk(transformed_node, static_node):
if isinstance(d_item, gast.AST):
transformed_node_list.append(d_item)
static_node_list.append(s_item)
def update_op_callstack_with_origin_info(program):
"""
Replaces op callstack information about transformed static code with original dygraph code.
"""
assert isinstance(program, Program)
def get_new_op_callstack(callstack):
"""
An example of callstack:
File "path1/to/file.py", line 10, in func_1
y = fluid.layers.fill_constant(x, shape=[1], dtype="int32")
File "path2/to/file.py", line 740, in fill_constant
stop_gradient=True)
File "path3/to/file.py", line 43, in append_op
return self.main_program.current_block().append_op(*args, **kwargs)
File "path4/to/file.py", line 2811, in append_op
attrs=kwargs.get("attrs", None))
File "path5/to/file.py", line 1919, in __init__
for frame in traceback.extract_stack():
"""
assert len(callstack) % 2 == 0
for i in range(0, len(callstack), 2):
file_line = callstack[i].lstrip(" ").split(",")
filepath = file_line[0][6:-1]
lineno = int(file_line[1][6:])
funcname = file_line[2][4:]
code = callstack[i + 1].lstrip(" ")
loc = Location(filepath, lineno)
dygraph_func_info = global_origin_info_map.get(loc.line_location)
if dygraph_func_info:
filepath, lineno, funcname, code = \
dygraph_func_info.as_frame()
callstack[i] = ' File "{}", line {}, in {}'.format(
filepath, lineno, funcname)
callstack[i + 1] = ' {}'.format(code)
return callstack
op_maker = core.op_proto_and_checker_maker
callstack_var_name = op_maker.kOpCreationCallstackAttrName()
for block in program.blocks:
for i, op in enumerate(block.ops):
if op.has_attr(callstack_var_name):
callstack = op.attr(callstack_var_name)
callstack = get_new_op_callstack(callstack)
op._set_attr(callstack_var_name, callstack)
return program
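# Hedged sketch of the fixed-offset slicing used in get_new_op_callstack.
frame = '  File "path1/to/file.py", line 10, in func_1'
file_line = frame.lstrip(" ").split(",")
filepath = file_line[0][6:-1]    # drops 'File "' and the closing quote
lineno = int(file_line[1][6:])   # drops ' line '
funcname = file_line[2][4:]      # drops ' in '
print(filepath, lineno, funcname)  # path1/to/file.py 10 func_1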
......@@ -130,8 +130,6 @@ class PartialProgramLayer(layers.Layer):
self._check_params_all_inited(main_program)
# 2. Prune the parameters not used anywhere in the program.
self._prune_unused_params(main_program)
return main_program
......
......@@ -47,8 +47,7 @@ class PrintTransformer(gast.NodeTransformer):
# NOTE: deal with print in PY3
def visit_Call(self, node):
if isinstance(node.func, gast.Name) and node.func.id == 'print':
node = self._create_print_node(node.args)
return node
# NOTE: deal with print in PY2
......
......@@ -36,6 +36,9 @@ from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
from paddle.fluid.dygraph.base import param_guard
from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from
from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info, create_and_update_origin_info_map
from paddle.fluid.dygraph.dygraph_to_static.origin_info import update_op_callstack_with_origin_info
from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data, ERROR_DATA
__all__ = ['ProgramTranslator', 'convert_to_static']
......@@ -88,15 +91,23 @@ class FunctionCache(object):
# with decorator directly and function.__wrapped__ holds the actual function.
func = getattr(func, '__wrapped__', func)
source_code = func_to_source_code(func)
# TODO(liym27):
# Consider this case: source_code in self._code_to_ast_caches,
# but actually they are methods in different classes.
# Maybe use (__class__, source_code) as key
if source_code in self._code_to_ast_caches:
root_wrapper = self._code_to_ast_caches[source_code]
else:
root = gast.parse(source_code)
root = attach_origin_info(root, func)
root_wrapper = self._dygraph_to_static.get_static_ast(root)
self._code_to_ast_caches[source_code] = root_wrapper
# Get static function from AST
static_func, file_name = ast_to_func(root_wrapper.node, func)
create_and_update_origin_info_map(root_wrapper.node, static_func)
return static_func
def exist(self, func):
......@@ -125,6 +136,7 @@ class FunctionSpec(object):
self._args = args
self._kwargs = kwargs
# TODO(liym27): func has multi layer decorator
dyfunc = getattr(func, '__wrapped__', func)
self._dyfunc_code = inspect.getsource(dyfunc)
......@@ -282,11 +294,19 @@ class ConcreteProgram(object):
# 3. Builds program only once and returns the output Variables.
with param_guard(func_spec.parameters(False)), param_guard(
func_spec.buffers(False)):
outputs = static_func(*inputs)
try:
outputs = static_func(*inputs)
except BaseException as e:
# NOTE: If e is raised in compile time, e should be attached to ERROR_DATA here.
attach_error_data(e)
raise
if not isinstance(outputs,
(tuple, list)) and outputs is not None:
outputs = [outputs]
main_program = update_op_callstack_with_origin_info(main_program)
return ConcreteProgram(
inputs=inputs,
outputs=outputs,
......@@ -483,14 +503,24 @@ class ProgramTranslator(object):
return dygraph_func(*args, **kwargs)
function_spec = FunctionSpec(dygraph_func, args, kwargs)
concrete_program, partial_program_layer = self._program_cache[
function_spec]
if args and isinstance(args[0], layers.Layer):
# Synchronize self.training attribute.
partial_program_layer.training = args[0].training
args = args[1:]
try:
return partial_program_layer(args)
except BaseException as e:
# NOTE:
# 1. If e is raised in compile time, e should have been attached to ERROR_DATA before;
# 2. If e raised in runtime, e should be attached to ERROR_DATA here.
if not hasattr(e, ERROR_DATA):
# runtime error
attach_error_data(e, in_runtime=True)
raise
def get_func(self, dygraph_func):
"""
......
......@@ -425,8 +425,7 @@ def _load_persistable_vars(model_path,
params_filename=None):
# 1. load extra var info
with open(var_info_path, 'rb') as f:
extra_var_info = pickle.load(f)
# 2. construct var dict
load_var_dict = dict()
......
......@@ -15,20 +15,23 @@
from __future__ import print_function
import os
import pickle
import warnings
import six
from paddle.fluid import core
from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph
from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA
from paddle.fluid.dygraph.dygraph_to_static.program_translator import FunctionSpec, ProgramTranslator
from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer
from paddle.fluid.dygraph.layers import Layer
from paddle.fluid.executor import Executor, scope_guard
from paddle.fluid.framework import Block, ParamBase, Program, Variable
from paddle.fluid.framework import _current_expected_place, _dygraph_guard, _dygraph_tracer
from paddle.fluid.framework import dygraph_only, in_dygraph_mode
from paddle.fluid.wrapped_decorator import wrap_decorator
__all__ = ['TracedLayer', 'declarative', 'dygraph_to_static_func']
......@@ -167,7 +170,25 @@ def _declarative_(dygraph_func):
"The decorator 'declarative' doesn't work when setting ProgramTranslator.enable=False. "
"We will just return dygraph output.")
return dygraph_func(*args, **kwargs)
try:
return program_translator.get_output(dygraph_func, *args, **kwargs)
except Exception as e:
error_data = getattr(e, ERROR_DATA, None)
if error_data:
new_exception = error_data.create_exception()
if six.PY3:
# NOTE(liym27):
# 1. Why `raise new_exception from None`?
                    # In Python 3, by default, a new exception is raised with the trace information of the caught exception.
# This only raises new_exception and hides unwanted implementation details from tracebacks of the
# caught exception.
# 2. Use exec to bypass syntax error checking in Python 2.
six.exec_("raise new_exception from None")
else:
raise new_exception
else:
raise
return __impl__
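# Hedged usage sketch: how the rewritten exception surfaces to a user of
# @declarative; the buggy function is illustrative and mirrors the
# error-handling tests later in this change.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.jit import declarative
from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA

@declarative
def buggy(x):
    x = fluid.dygraph.to_variable(x)
    return fluid.layers.reshape(x, shape=[1, 2])  # 6 elements -> 2: fails

with fluid.dygraph.guard():
    try:
        buggy(np.ones([3, 2]).astype('float32'))
    except Exception as e:
        # The message begins with "In user code:" and lists dygraph frames.
        print(hasattr(e, ERROR_DATA))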
......
......@@ -21,7 +21,7 @@ from paddle.fluid.executor import Executor
from paddle.fluid.optimizer import SGD
from paddle.fluid.incubate.fleet.base.mode import Mode
from paddle.fleet.base.role_maker import RoleMakerBase
from paddle.fluid.contrib.mixed_precision.decorator import OptimizerWithMixedPrecision
from . import mode
......@@ -209,7 +209,10 @@ class Fleet(object):
self._executor = Executor(fluid.CPUPlace())
from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase as RoleMakerBaseIncubate
if role_maker and not isinstance(role_maker, RoleMakerBaseIncubate):
raise TypeError(
"role_maker must be an instance of RoleMakerBase")
self._role_maker = role_maker
self._role_maker.generate_role()
......
......@@ -579,7 +579,7 @@ class FleetTranspiler(Fleet):
block.append_op(
type='recv_save',
attrs={
"trainer_id": self._role_maker.worker_id(),
"trainer_id": self._role_maker.worker_index(),
"shape": var.shape,
"slice_shapes":
[",".join([str(i) for i in var.shape])],
......
......@@ -329,7 +329,7 @@ class CompileTimeStrategy(object):
is_distributed = True if param_name in distibuted_varnames else False
ctx = self.build_ctx(grad, self.grad_var_mapping, True, True,
True, is_distributed)
send_ctx[ctx.var_name()] = ctx
......
......@@ -6200,7 +6200,7 @@ def squeeze(input, axes, name=None):
Out.shape = [1,3,5]
Args:
input (Variable): The input Tensor. Supported data type: float32, float64, bool, int8, int32, int64.
axes (list): One integer or List of integers, indicating the dimensions to be squeezed.
Axes range is :math:`[-rank(input), rank(input))`.
If axes is negative, :math:`axes=axes+rank(input)`.
......@@ -6226,8 +6226,9 @@ def squeeze(input, axes, name=None):
helper = LayerHelper("squeeze", **locals())
check_variable_and_dtype(
input, 'input',
['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'],
'squeeze')
check_type(axes, 'axis/axes', (list, tuple), 'squeeze')
out = helper.create_variable_for_type_inference(dtype=input.dtype)
x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op(
......@@ -6254,12 +6255,12 @@ def unsqueeze(input, axes, name=None):
then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
Args:
input (Variable): The input Tensor to be unsqueezed. Supported data type: float32, float64, bool, int8, int32, int64.
axes (int|list|tuple|Variable): Indicates the dimensions to be inserted. The data type is ``int32`` . If ``axes`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``axes`` is an Variable, it should be an 1-D Tensor .
name (str|None): Name for this layer.
Returns:
Variable: Unsqueezed Tensor, with the same data type as input.
Examples:
.. code-block:: python
......@@ -6269,10 +6270,15 @@ def unsqueeze(input, axes, name=None):
y = fluid.layers.unsqueeze(input=x, axes=[1])
"""
if in_dygraph_mode():
out, _ = core.ops.unsqueeze2(input, 'axes', axes)
return out
check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze')
check_variable_and_dtype(
input, 'input',
['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'],
'unsqueeze')
helper = LayerHelper("unsqueeze2", **locals())
inputs = {"X": input}
attrs = {}
......@@ -9966,7 +9972,7 @@ def stack(x, axis=0, name=None):
must be the same. Supposing input is N dims
Tensors :math:`[d_0, d_1, ..., d_{n-1}]`, the output is N+1 dims
Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`.
Supported data types: float32, float64, int32, int64.
axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is :math:`[-(R+1), R+1)`.
R is the first tensor of inputs. If ``axis`` < 0, :math:`axis=axis+rank(x[0])+1`.
The default value of axis is 0.
......
......@@ -685,8 +685,9 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
"""
attrs = {'force_cpu': force_cpu}
dtype = convert_dtype(dtype)
if not isinstance(value, Variable):
if dtype in ['int64', 'int32']:
attrs['str_value'] = str(int(value))
else:
attrs['str_value'] = str(float(value))
......@@ -697,7 +698,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
out = _varbase_creator(dtype=dtype)
if isinstance(value, Variable):
if dtype in ['int64', 'int32']:
attrs['str_value'] = str(int(value.numpy()))
else:
attrs['str_value'] = str(float(value.numpy()))
......@@ -712,6 +713,8 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
helper = LayerHelper("fill_constant", **locals())
inputs = {}
if isinstance(value, Variable):
if convert_dtype(value.dtype) != dtype:
value = cast(value, dtype)
inputs['ValueTensor'] = value
check_dtype(dtype, 'dtype',
......
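# Hedged sketch of the ValueTensor cast above, mirroring the val2/out_8 case
# added to the fill_constant test further below (static graph mode).
import paddle.fluid as fluid

val64 = fluid.layers.fill_constant(shape=[1], dtype='float64', value=1.1)
# Requested dtype float32 != value dtype float64, so value is cast first.
out = fluid.layers.fill_constant(shape=[1, 2], dtype='float32', value=val64)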
......@@ -345,7 +345,6 @@ if(WITH_DISTRIBUTE)
# FIXME(typhoonzero): add these tests back
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer")
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler")
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr")
#not need
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_base")
......
......@@ -28,6 +28,7 @@ import numpy as np
import ctr_dataset_reader
from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
from paddle.fleet.base.util_factory import fleet_util
# Fix seed for test
fluid.default_startup_program().random_seed = 1
......@@ -181,8 +182,14 @@ class TestDistCTR2x2(FleetDistRunnerBase):
loss_val = exe.run(program=compiled_prog,
fetch_list=[self.avg_cost.name])
loss_val = np.mean(loss_val)
print("TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
loss_val))
reduce_output = fleet_util.all_reduce(
np.array(loss_val), mode="sum")
loss_all_trainer = fleet_util.all_gather(float(loss_val))
loss_val = float(reduce_output) / len(loss_all_trainer)
message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
loss_val)
fleet_util.print_on_rank(message, 0)
pass_time = time.time() - pass_start
except fluid.core.EOFException:
self.reader.reset()
......
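# Hedged sketch of the cross-trainer averaging above: all_reduce(mode="sum")
# sums per-trainer losses, all_gather returns one entry per trainer, so
# dividing recovers the mean (values illustrative).
losses = [0.4, 0.6]          # one loss per trainer
reduce_output = sum(losses)  # what all_reduce(..., mode="sum") yields
loss_all_trainer = losses    # what all_gather(...) yields
print(float(reduce_output) / len(loss_all_trainer))  # 0.5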
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import inspect
import unittest
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.core import EnforceNotMet
from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA, ErrorData
from paddle.fluid.dygraph.dygraph_to_static.origin_info import unwrap
from paddle.fluid.dygraph.jit import declarative
def inner_func():
fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")
return
@declarative
def func_error_in_compile_time(x):
x = fluid.dygraph.to_variable(x)
inner_func()
if fluid.layers.mean(x) < 0:
x_v = x - 1
else:
x_v = x + 1
return x_v
@declarative
def func_error_in_compile_time_2(x):
x = fluid.dygraph.to_variable(x)
x = fluid.layers.reshape(x, shape=[1, 2])
return x
@declarative
def func_error_in_runtime(x, iter_num=3):
x = fluid.dygraph.to_variable(x)
two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32")
x = fluid.layers.reshape(x, shape=[1, two])
return x
class TestErrorInCompileTime(unittest.TestCase):
def setUp(self):
self.set_func()
self.set_input()
self.set_exception_type()
def set_func(self):
self.func = func_error_in_compile_time
def set_exception_type(self):
self.exception_type = TypeError
def set_input(self):
self.input = np.ones([3, 2])
def set_message(self):
self.expected_message = \
['File "{}", line 36, in func_error_in_compile_time'.format(self.filepath),
'inner_func()',
'File "{}", line 29, in inner_func'.format(self.filepath),
'fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")',
]
def _test_create_message(self, error_data):
self.filepath = inspect.getfile(unwrap(self.func))
self.set_message()
error_message = error_data.create_message()
self.assertIn('In user code:', error_message)
for m in self.expected_message:
self.assertIn(m, error_message)
def test(self):
with fluid.dygraph.guard():
with self.assertRaises(self.exception_type) as cm:
self.func(self.input)
exception = cm.exception
error_data = getattr(exception, ERROR_DATA)
self.assertIsInstance(error_data, ErrorData)
self._test_create_message(error_data)
class TestErrorInCompileTime2(TestErrorInCompileTime):
def set_func(self):
self.func = func_error_in_compile_time_2
def set_exception_type(self):
self.exception_type = EnforceNotMet
def set_message(self):
self.expected_message = \
[
'File "{}", line 47, in func_error_in_compile_time_2'.format(self.filepath),
'x = fluid.layers.reshape(x, shape=[1, 2])'
]
class TestErrorInRuntime(TestErrorInCompileTime):
def set_func(self):
self.func = func_error_in_runtime
def set_exception_type(self):
self.exception_type = EnforceNotMet
def set_message(self):
self.expected_message = \
[
'File "{}", line 55, in func_error_in_runtime'.format(self.filepath),
'x = fluid.layers.reshape(x, shape=[1, two])'
]
def _test_create_message(self, error_data):
self.filepath = inspect.getfile(unwrap(self.func))
self.set_message()
with self.assertRaises(ValueError):
error_data.create_message()
error_data.in_runtime = False
error_message = error_data.create_message()
self.assertIn('In user code:', error_message)
for m in self.expected_message:
self.assertIn(m, error_message)
if __name__ == '__main__':
unittest.main()
......@@ -90,7 +90,8 @@ class TestOriginInfo(unittest.TestCase):
# step3
self.static_func, _ = ast_to_func(transformed_ast, self.dygraph_func)
info_map = create_and_update_origin_info_map(dygraph_ast,
self.static_func)
return info_map
......
......@@ -17,7 +17,7 @@ import sys
import time
def train(prefix):
selected_gpus = os.getenv("FLAGS_selected_gpus")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
......@@ -29,11 +29,12 @@ def train():
.format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
print(name)
with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
"w") as f:
f.write(name)
def train_abort(prefix):
selected_gpus = os.getenv("FLAGS_selected_gpus")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
......@@ -49,8 +50,9 @@ def train_abort():
name = "abort>>> selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
.format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
print(name)
with open("multi_process.check_{}.log".format(trainer_id),
"w") as f:
with open(
"multi_process_{}.check_{}.log".format(prefix, trainer_id),
"w") as f:
f.write(name)
raise
else:
......@@ -60,12 +62,15 @@ def train_abort():
.format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
print(name)
with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
"w") as f:
f.write(name)
if __name__ == '__main__':
if len(sys.argv) == 2 and sys.argv[1] == "abort":
train_abort()
if len(sys.argv) == 3 and sys.argv[2] == "abort":
prefix = sys.argv[1]
train_abort(prefix)
else:
prefix = sys.argv[1]
train(prefix)
......@@ -63,18 +63,104 @@ class TestAddMMOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# The input type of addmm_op must be Variable.
input = fluid.create_lod_tensor(
np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
x1 = fluid.create_lod_tensor(
np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
x2 = fluid.create_lod_tensor(
np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
self.assertRaises(TypeError, paddle.addmm, input, x1, x2)
            # The input dtype of addmm_op must be float32 or float64.
input = fluid.layers.data(name='input', shape=[4], dtype="int32")
x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32")
x4 = fluid.layers.data(name='x4', shape=[4], dtype="int32")
input = fluid.layers.data(
name='input',
shape=[4, 4],
dtype="int32",
append_batch_size=False)
x3 = fluid.layers.data(
name='x3', shape=[4, 4], dtype="int32", append_batch_size=False)
x4 = fluid.layers.data(
name='x4', shape=[4, 4], dtype="int32", append_batch_size=False)
self.assertRaises(TypeError, paddle.addmm, input, x3, x4)
# x and y dimension mismatch
x5 = fluid.layers.data(
name='x5',
shape=[4, 5],
dtype="float32",
append_batch_size=False)
x6 = fluid.layers.data(
name='x6',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input, x5, x6)
# input and x are not broadcastable
x7 = fluid.layers.data(
name='x7',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
x8 = fluid.layers.data(
name='x8',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
input1 = fluid.layers.data(
name='input1',
shape=[2, 4],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input1, x7, x8)
# input and x are not broadcastable
x9 = fluid.layers.data(
name='x9',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
x10 = fluid.layers.data(
name='x10',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
input2 = fluid.layers.data(
name='input2',
shape=[1, 2],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input2, x9, x10)
x11 = fluid.layers.data(
name='x11',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
x12 = fluid.layers.data(
name='x12',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
input3 = fluid.layers.data(
name='input3',
shape=[4, 2],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input3, x11, x12)
x13 = fluid.layers.data(
name='x13',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
x14 = fluid.layers.data(
name='x14',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
input4 = fluid.layers.data(
name='input4',
shape=[3, 1],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input4, x13, x14)
class TestAddMMOp2(TestAddMMOp):
......@@ -147,5 +233,23 @@ class TestAddMMOp4(unittest.TestCase):
assert np.allclose(np_input + np.dot(np_x, np_y), out.numpy())
'''
class TestAddMMAPI(unittest.TestCase):
def test_api_error(self):
data_x = np.ones((2, 2)).astype(np.float32)
data_y = np.ones((2, 2)).astype(np.float32)
data_input = np.ones((2, 2)).astype(np.float32)
paddle.enable_imperative()
def test_error1():
data_x_wrong = np.ones((2, 3)).astype(np.float32)
x = paddle.imperative.to_variable(data_x_wrong)
y = paddle.imperative.to_variable(data_y)
input = paddle.imperative.to_variable(data_input)
out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )
self.assertRaises(ValueError, test_error1)
'''
if __name__ == "__main__":
unittest.main()
......@@ -73,5 +73,15 @@ class API_TestDygraphBmm(unittest.TestCase):
self.assertTrue(np.allclose(expected_result, out_np))
class TestBmmAPIError(unittest.TestCase):
def test_api_error(self):
x_data = np.arange(24, dtype='float32').reshape((2, 3, 4))
y_data = np.arange(16, dtype='float32').reshape((2, 4, 2))
y_data_wrong1 = np.arange(16, dtype='float32').reshape((2, 2, 4))
y_data_wrong2 = np.arange(16, dtype='float32').reshape((2, 2, 2, 2))
self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong1)
self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong2)
if __name__ == "__main__":
unittest.main()
......@@ -21,6 +21,9 @@ import os
import sys
import subprocess
import six
import shutil
import numpy as np
import argparse
from contextlib import closing
import socket
......@@ -29,7 +32,8 @@ import tempfile
import unittest
import paddle.fluid as fluid
import paddle.fleet.base.role_maker as role_maker
from paddle.fleet.base.util_factory import fleet_util
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
......@@ -48,18 +52,26 @@ class FleetDistRunnerBase(object):
"""
def build_role(self, args):
if args.role.upper() == "PSERVER":
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=True,
path=args.gloo_path,
current_id=args.current_id,
role=role_maker.Role.SERVER,
worker_num=args.trainers,
worker_endpoints=args.trainer_endpoints.split(","),
server_endpoints=args.endpoints.split(","))
else:
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=True,
path=args.gloo_path,
current_id=args.current_id,
role=role_maker.Role.WORKER,
worker_num=args.trainers,
worker_endpoints=args.trainer_endpoints.split(","),
server_endpoints=args.endpoints.split(","))
self.role = role
return role
def build_strategy(self, args):
......@@ -114,26 +126,13 @@ class FleetDistRunnerBase(object):
optimizer.minimize(avg_cost)
def run_pserver(self, args):
fleet.init_server()
fleet.run_server()
def run_dataset_trainer(self, args):
out = self.do_dataset_training(fleet)
def run_pyreader_trainer(self, args):
out = self.do_pyreader_training(fleet)
def net(self, args, batch_size=4, lr=0.01):
......@@ -173,10 +172,14 @@ class TestFleetBase(unittest.TestCase):
print("set begin_port:", DIST_UT_PORT)
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
DIST_UT_PORT, DIST_UT_PORT + 1)
self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
DIST_UT_PORT + 2, DIST_UT_PORT + 3)
DIST_UT_PORT += 4
else:
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._python_interp = sys.executable
self._geo_sgd_need_push_nums = 5
......@@ -236,18 +239,22 @@ class TestFleetBase(unittest.TestCase):
def _run_cluster(self, model, envs):
env = {'GRAD_CLIP': str(self._grad_clip_mode)}
python_path = self._python_interp
gloo_path = tempfile.mkdtemp()
if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
python_path += " -m coverage run --branch -p"
env.update(envs)
tr_cmd = "{0} {1} --role trainer --endpoints {2} --current_id {{}} --trainers {3} --mode {4} --geo_sgd_need_push_nums {5} --reader {6}".format(
python_path, model, self._ps_endpoints, self._trainers, self._mode,
self._geo_sgd_need_push_nums, self._reader)
tr_cmd = "{0} {1} --role trainer --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8}".format(
python_path, model, self._ps_endpoints, self._tr_endpoints,
self._trainers, self._mode, self._geo_sgd_need_push_nums,
self._reader, gloo_path)
ps_cmd = "{0} {1} --role pserver --endpoints {2} --current_id {{}} --trainers {3} --mode {4} --geo_sgd_need_push_nums {5} --reader {6}".format(
python_path, model, self._ps_endpoints, self._trainers, self._mode,
self._geo_sgd_need_push_nums, self._reader)
ps_cmd = "{0} {1} --role pserver --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8}".format(
python_path, model, self._ps_endpoints, self._tr_endpoints,
self._trainers, self._mode, self._geo_sgd_need_push_nums,
self._reader, gloo_path)
# Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env)
......@@ -284,6 +291,7 @@ class TestFleetBase(unittest.TestCase):
ps0.terminate()
ps1.terminate()
shutil.rmtree(gloo_path)
return 0, 0
def check_with_place(self,
......@@ -313,6 +321,9 @@ def runtime_main(test_class):
parser.add_argument(
'--role', type=str, required=True, choices=['pserver', 'trainer'])
parser.add_argument('--endpoints', type=str, required=False, default="")
parser.add_argument(
'--trainer_endpoints', type=str, required=False, default="")
parser.add_argument('--gloo_path', type=str, required=False, default="")
parser.add_argument('--current_id', type=int, required=False, default=0)
parser.add_argument('--trainers', type=int, required=False, default=1)
parser.add_argument('--mode', type=str, required=False, default='geo')
......@@ -322,6 +333,13 @@ def runtime_main(test_class):
args = parser.parse_args()
model = test_class()
role = model.build_role(args)
fleet.init(role)
strategy = model.build_strategy(args)
avg_cost = model.net(args)
model.build_optimizer(avg_cost, strategy)
fleet_util._set_strategy(strategy)
fleet_util._set_role_maker(role)
if args.role == "pserver":
model.run_pserver(args)
else:
......
......@@ -22,7 +22,7 @@ from test_dist_fleet_base import TestFleetBase
class TestDistMnistSync2x2(TestFleetBase):
def _setup_config(self):
self._mode = "sync"
self._mode = "async"
self._reader = "pyreader"
def check_with_place(self,
......
......@@ -269,18 +269,26 @@ class TestFillConstantAPI(unittest.TestCase):
out_6 = fluid.layers.fill_constant(
shape=shape_tensor_int64, dtype=np.float32, value=1.1)
val1 = fluid.layers.fill_constant(
shape=[1], dtype=np.float32, value=1.1)
val2 = fluid.layers.fill_constant(
shape=[1], dtype=np.float64, value=1.1)
out_7 = fluid.layers.fill_constant(
shape=shape_tensor_int64, dtype=np.float32, value=val1)
out_8 = fluid.layers.fill_constant(
shape=shape_tensor_int64, dtype=np.float32, value=val2)
exe = fluid.Executor(place=fluid.CPUPlace())
res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8 = exe.run(
fluid.default_main_program(),
feed={
"shape_tensor_int32": np.array([1, 2]).astype("int32"),
"shape_tensor_int64": np.array([1, 2]).astype("int64"),
},
fetch_list=[
out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
])
assert np.array_equal(res_1, np.full([1, 2], 1.1, dtype="float32"))
assert np.array_equal(res_2, np.full([1, 2], 1.1, dtype="float32"))
......@@ -289,6 +297,31 @@ class TestFillConstantAPI(unittest.TestCase):
assert np.array_equal(res_5, np.full([1, 2], 1.1, dtype="float32"))
assert np.array_equal(res_6, np.full([1, 2], 1.1, dtype="float32"))
assert np.array_equal(res_7, np.full([1, 2], 1.1, dtype="float32"))
assert np.array_equal(res_8, np.full([1, 2], 1.1, dtype="float32"))
class TestFillConstantImperative(unittest.TestCase):
def test_api(self):
with fluid.dygraph.guard():
data1 = np.array([1, 2]).astype('int32')
data2 = np.array([1.1]).astype('float32')
shape = fluid.dygraph.to_variable(data1)
val = fluid.dygraph.to_variable(data2)
res1 = fluid.layers.fill_constant(
shape=[1, 2], dtype='float32', value=1.1)
res2 = fluid.layers.fill_constant(
shape=shape, dtype='float32', value=1.1)
res3 = fluid.layers.fill_constant(
shape=shape, dtype='float32', value=val)
assert np.array_equal(
res1.numpy(), np.full(
[1, 2], 1.1, dtype="float32"))
assert np.array_equal(
res2.numpy(), np.full(
[1, 2], 1.1, dtype="float32"))
assert np.array_equal(
res3.numpy(), np.full(
[1, 2], 1.1, dtype="float32"))
class TestFillConstantOpError(unittest.TestCase):
......
......@@ -4,7 +4,6 @@ set -e
function test_launch_ps(){
fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "test pserver launch succeed"
else
......@@ -20,7 +19,7 @@ fi
test_launch_ps
# use default values
fleetrun multi_process.py fleetrun
# use paddlecloud
echo "begin test use paddlecloud"
......@@ -30,16 +29,16 @@ export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2
distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun
str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
file_0="multi_process.check_0.log"
file_1="multi_process.check_1.log"
str1="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
file_0="multi_process_fleetrun.check_0.log"
file_1="multi_process_fleetrun.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
......@@ -70,7 +69,7 @@ unset TRAINER_PORTS_NUM
echo ""
echo "paddle.distributed.launch async poll process test"
if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun abort; then
echo "train abort as planned"
fi
......
......@@ -40,10 +40,9 @@ class TestCloudRoleMaker(unittest.TestCase):
from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
from paddle.fluid.incubate.fleet.base.role_maker import \
GeneralRoleMaker
from paddle.fleet.utils import KVHandler
from paddle.fleet.utils import KVServer
from paddle.fleet.utils import KVHTTPServer
except:
print("warning: no fleet, skip test_pslib_4")
return
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test cloud role maker."""
from __future__ import print_function
import os
import unittest
import paddle.fleet.base.role_maker as role_maker
class TestRoleMakerBase(unittest.TestCase):
"""
Test cases for RoleMakerBase
"""
def test_rolemaker_base(self):
role = role_maker.RoleMakerBase()
self.assertRaises(Exception, role.is_worker)
self.assertRaises(Exception, role.is_server)
self.assertRaises(Exception, role.is_first_worker)
self.assertRaises(Exception, role.worker_num)
self.assertRaises(Exception, role.server_num)
self.assertRaises(Exception, role.worker_index)
self.assertRaises(Exception, role.server_index)
self.assertRaises(Exception, role.role_id)
trainer_endpoints = role.get_trainer_endpoints()
self.assertTrue(len(trainer_endpoints) == 0)
pserver_endpoints = role.get_pserver_endpoints()
self.assertTrue(len(pserver_endpoints) == 0)
print(role.to_string())
self.assertTrue(role._all_gather(role._node_type_comm, 1) is None)
self.assertTrue(role._all_reduce(role._node_type_comm, 1) is None)
role._barrier(role._node_type_comm)
class TestCloudRoleMaker(unittest.TestCase):
"""
Test cases for PaddleCloudRoleMaker.
"""
def setUp(self):
"""Set up, set envs."""
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ[
"PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"
os.environ[
"PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.2:36001"
os.environ["POD_IP"] = "127.0.0.1"
def test_tr_rolemaker(self):
"""Test tr rolenamer."""
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ID"] = "0"
try:
import netifaces
except:
print("warning: no netifaces, skip test_tr_rolemaker")
return
ro = role_maker.PaddleCloudRoleMaker(
is_collective=False, init_gloo=False)
self.assertTrue(ro.is_worker())
self.assertFalse(ro.is_server())
self.assertEqual(ro.worker_num(), 2)
self.assertTrue(ro.is_first_worker())
worker_endpoints = ro.get_trainer_endpoints()
self.assertEqual(worker_endpoints[0], '127.0.0.1:36001')
self.assertEqual(ro.role_id(), 0)
def test_tr_rolemaker_collective(self):
ro = role_maker.PaddleCloudRoleMaker(is_collective=True)
self.assertEqual(ro.worker_num(), 2)
def test_ps_rolemaker(self):
"""Test ps rolemaker."""
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
try:
import netifaces
except:
print("warning: no netifaces, skip test_ps_rolemaker")
return
ro = role_maker.PaddleCloudRoleMaker(
is_collective=False, init_gloo=False)
self.assertEqual(ro.server_index(), 0)
self.assertFalse(ro.is_worker())
self.assertTrue(ro.is_server())
self.assertEqual(ro.server_num(), 2)
pserver_endpoints = ro.get_pserver_endpoints()
self.assertEqual(pserver_endpoints[0], '127.0.0.1:36001')
self.assertTrue(ro._all_gather(ro._all_comm, 1) is None)
self.assertTrue(ro._all_reduce(ro._all_comm, 1) is None)
    def test_training_role(self):
"""Test training role."""
os.environ["TRAINING_ROLE"] = "TEST"
try:
import netifaces
except:
print("warning: no netifaces, skip test_training_role")
return
ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
self.assertRaises(ValueError, ro.generate_role)
class TestUserDefinedRoleMaker(unittest.TestCase):
"""
Test cases for UserDefinedRoleMaker.
"""
def setUp(self):
pass
def test_ps_rolemaker(self):
try:
import netifaces
except:
print("warning: no netifaces, skip test_ps_rolemaker")
return
ro = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
role=role_maker.Role.SERVER,
current_id=0,
worker_num=2)
self.assertEqual(ro.server_num(), 2)
ro.generate_role()
self.assertTrue(ro.is_server())
self.assertEqual(ro.role_id(), 0)
def test_tr_rolemaker(self):
try:
import netifaces
except:
print("warning: no netifaces, skip test_tr_rolemaker")
return
ro = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
role=role_maker.Role.WORKER,
current_id=0,
worker_num=2)
self.assertIn("127.0.0.1:36001", ro.get_pserver_endpoints())
self.assertTrue(ro.is_worker())
self.assertEqual(ro.role_id(), 0)
if __name__ == "__main__":
unittest.main()
......@@ -12,12 +12,27 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import unittest
import numpy as np
import tarfile
import tempfile
import os
import sys
from paddle.dataset.common import download, DATA_HOME
from paddle.fleet.base.util_factory import fleet_util
import paddle.fleet.base.role_maker as role_maker
class TestFleetUtil(unittest.TestCase):
proto_data_url = "https://fleet.bj.bcebos.com/fleet_util_data.tgz"
proto_data_md5 = "59b7f12fd9dc24b64ae8e4629523a92a"
module_name = "fleet_util_data"
pruned_dir = os.path.join("fleet_util_data", "pruned_model")
train_dir = os.path.join("fleet_util_data", "train_program")
def test_util_base(self):
import paddle.fleet as fleet
util = fleet.UtilBase()
......@@ -65,6 +80,262 @@ class TestFleetUtil(unittest.TestCase):
user_id = fleet.util.get_user_id()
self.assertEqual(user_id, 10)
def test_fs(self):
from paddle.fleet.utils import LocalFS
fs = LocalFS()
dirs, files = fs.ls_dir("test_tmp")
dirs, files = fs.ls_dir("./")
self.assertFalse(fs.need_upload_download())
fleet_util.set_file_system(fs)
def test_barrier(self):
try:
import netifaces
except ImportError:
print("warning: no netifaces, skip test_barrier")
return
gloo = fluid.core.Gloo()
gloo.set_rank(0)
gloo.set_size(1)
gloo.set_prefix("123")
gloo.set_iface("lo")
gloo.set_hdfs_store("./tmp_test_fleet_barrier", "", "")
gloo.init()
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.SERVER,
worker_endpoints=["127.0.0.1:6003"],
server_endpoints=["127.0.0.1:6001"])
role._node_type_comm = gloo
role._role_is_generated = True
fleet_util._set_role_maker(role)
fleet_util.barrier("worker")
def test_all_reduce(self):
try:
import netifaces
except ImportError:
print("warning: no netifaces, skip test_all_reduce")
return
gloo = fluid.core.Gloo()
gloo.set_rank(0)
gloo.set_size(1)
gloo.set_prefix("123")
gloo.set_iface("lo")
gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
gloo.init()
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.WORKER,
worker_endpoints=["127.0.0.1:6003"],
server_endpoints=["127.0.0.1:6001"])
role._node_type_comm = gloo
role._role_is_generated = True
fleet_util._set_role_maker(role)
output = fleet_util.all_reduce(1, "sum", comm_world="server")
print(output)
# self.assertEqual(output, 1)
def test_all_gather(self):
try:
import netifaces
except ImportError:
print("warning: no netifaces, skip test_all_gather")
return
gloo = fluid.core.Gloo()
gloo.set_rank(0)
gloo.set_size(1)
gloo.set_prefix("123")
gloo.set_iface("lo")
gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
gloo.init()
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.SERVER,
worker_endpoints=["127.0.0.1:6003"],
server_endpoints=["127.0.0.1:6001"])
role._node_type_comm = gloo
role._all_comm = gloo
role._role_is_generated = True
fleet_util._set_role_maker(role)
output = fleet_util.all_gather(1, comm_world="all")
print(output)
# self.assertTrue(len(output) == 1 and output[0] == 1)
self.assertRaises(Exception, fleet_util.all_gather, 1, "test")
def download_files(self):
path = download(self.proto_data_url, self.module_name,
self.proto_data_md5)
print('data is downloaded at ' + path)
tar = tarfile.open(path)
unzip_folder = tempfile.mkdtemp()
tar.extractall(unzip_folder)
return unzip_folder
def test_get_file_shard(self):
self.assertRaises(Exception, fleet_util.get_file_shard, "files")
try:
import netifaces
except ImportError:
print("warning: no netifaces, skip test_get_file_shard")
return
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.WORKER,
worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
fleet_util._set_role_maker(role)
files = fleet_util.get_file_shard(["1", "2", "3"])
self.assertTrue(len(files) == 2 and "1" in files and "2" in files)
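# get_file_shard hands each worker a contiguous block of the file list; with
# 3 files and 2 workers, worker 0 gets two files and worker 1 gets one, which
# is exactly what the assertion above checks. A sketch of that rule (an
# illustrative reimplementation, not the fleet_util code itself):
def sketch_file_shard(files, worker_num, worker_index):
    blocksize = len(files) // worker_num
    remainder = len(files) % worker_num
    # the first `remainder` workers each receive one extra file
    begin = worker_index * blocksize + min(worker_index, remainder)
    end = begin + blocksize + (1 if worker_index < remainder else 0)
    return files[begin:end]

assert sketch_file_shard(["1", "2", "3"], 2, 0) == ["1", "2"]
assert sketch_file_shard(["1", "2", "3"], 2, 1) == ["3"]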
def test_program_type_trans(self):
data_dir = self.download_files()
program_dir = os.path.join(data_dir, self.pruned_dir)
text_program = "pruned_main_program.pbtxt"
binary_program = "pruned_main_program.bin"
text_to_binary = fleet_util._program_type_trans(program_dir,
text_program, True)
binary_to_text = fleet_util._program_type_trans(program_dir,
binary_program, False)
self.assertTrue(
os.path.exists(os.path.join(program_dir, text_to_binary)))
self.assertTrue(
os.path.exists(os.path.join(program_dir, binary_to_text)))
def test_params_check(self):
data_dir = self.download_files()
class config:
pass
feed_config = config()
feed_config.feeded_vars_names = ['concat_1.tmp_0', 'concat_2.tmp_0']
feed_config.feeded_vars_dims = [682, 1199]
feed_config.feeded_vars_types = [np.float32, np.float32]
feed_config.feeded_vars_filelist = [
os.path.join(data_dir, os.path.join(self.pruned_dir, "concat_1")),
os.path.join(data_dir, os.path.join(self.pruned_dir, "concat_2"))
]
fetch_config = config()
fetch_config.fetch_vars_names = ['similarity_norm.tmp_0']
conf = config()
conf.batch_size = 1
conf.feed_config = feed_config
conf.fetch_config = fetch_config
conf.dump_model_dir = os.path.join(data_dir, self.pruned_dir)
conf.dump_program_filename = "pruned_main_program.pbtxt"
conf.is_text_dump_program = True
conf.save_params_filename = None
# test saved var's shape
conf.dump_program_filename = "pruned_main_program.save_var_shape_not_match"
self.assertRaises(Exception, fleet_util._params_check, conf)
# test program.proto without feed_op and fetch_op
conf.dump_program_filename = "pruned_main_program.no_feed_fetch"
results = fleet_util._params_check(conf)
self.assertTrue(len(results) == 1)
np.testing.assert_array_almost_equal(
results[0], np.array(
[[3.0590223e-07]], dtype=np.float32))
# test feed_var's shape
conf.dump_program_filename = "pruned_main_program.feed_var_shape_not_match"
self.assertRaises(Exception, fleet_util._params_check, conf)
# test correct case with feed_vars_filelist
conf.dump_program_filename = "pruned_main_program.pbtxt"
results = fleet_util._params_check(conf)
self.assertTrue(len(results) == 1)
np.testing.assert_array_almost_equal(
results[0], np.array(
[[3.0590223e-07]], dtype=np.float32))
# test correct case without feed_vars_filelist
conf.feed_config.feeded_vars_filelist = None
# test feed var with lod_level >= 2
conf.dump_program_filename = "pruned_main_program.feed_lod2"
self.assertRaises(Exception, fleet_util._params_check, conf)
conf.dump_program_filename = "pruned_main_program.pbtxt"
results = fleet_util._params_check(conf)
self.assertTrue(len(results) == 1)
def test_proto_check(self):
data_dir = self.download_files()
class config:
pass
conf = config()
conf.train_prog_path = os.path.join(
data_dir, os.path.join(self.train_dir, "join_main_program.pbtxt"))
conf.is_text_train_program = True
# test not match
conf.pruned_prog_path = os.path.join(
data_dir,
os.path.join(self.pruned_dir,
"pruned_main_program.save_var_shape_not_match"))
conf.is_text_pruned_program = True
conf.draw = False
res = fleet_util._proto_check(conf)
self.assertFalse(res)
# test match
conf.pruned_prog_path = os.path.join(
data_dir,
os.path.join(self.pruned_dir, "pruned_main_program.pbtxt"))
if sys.platform == 'win32':
conf.draw = False
else:
conf.draw = True
conf.draw_out_name = "pruned_check"
res = fleet_util._proto_check(conf)
self.assertTrue(res)
def test_visualize(self):
if sys.platform == 'win32':
pass
else:
data_dir = self.download_files()
program_path = os.path.join(
data_dir,
os.path.join(self.train_dir, "join_main_program.pbtxt"))
is_text = True
program = fleet_util._load_program(program_path, is_text)
output_dir = os.path.join(data_dir, self.train_dir)
output_filename = "draw_prog"
fleet_util._visualize_graphviz(program, output_dir, output_filename)
self.assertTrue(
os.path.exists(
os.path.join(output_dir, output_filename + ".dot")))
self.assertTrue(
os.path.exists(
os.path.join(output_dir, output_filename + ".pdf")))
if __name__ == "__main__":
unittest.main()
......@@ -20,9 +20,7 @@ import os
import sys
import inspect
from paddle.fluid.incubate.fleet.utils.fs import LocalFS, FS
from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
from paddle.fluid.incubate.fleet.utils.hdfs import FSTimeOut, FSFileExistsError, FSFileNotExistsError
from paddle.fleet.utils import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
class FSTest(unittest.TestCase):
......
......@@ -19,9 +19,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, T
import os
import sys
from paddle.fluid.incubate.fleet.utils.fs import LocalFS
from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
from paddle.fluid.incubate.fleet.utils.hdfs import FSTimeOut, FSFileExistsError, FSFileNotExistsError
from paddle.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
java_home = os.environ["JAVA_HOME"]
......
......@@ -14,13 +14,15 @@
from __future__ import print_function
import os
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph import declarative
from paddle.fluid.dygraph import declarative, ProgramTranslator
from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME
BATCH_SIZE = 32
BATCH_NUM = 20
......@@ -77,8 +79,8 @@ class LinearNetReturnLoss(fluid.dygraph.Layer):
def train(layer):
# create optimizer
adam = fluid.optimizer.AdamOptimizer(
learning_rate=0.1, parameter_list=layer.parameters())
adam = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=layer.parameters())
# create data loader
train_loader = fluid.io.DataLoader.from_generator(capacity=5)
train_loader.set_batch_generator(random_batch_reader())
......@@ -111,37 +113,43 @@ class TestJitSaveLoad(unittest.TestCase):
# config seed
fluid.default_main_program().random_seed = SEED
def train_and_save_model(self):
def train_and_save_model(self, model_path=None, configs=None):
layer = LinearNet(784, 1)
example_inputs, layer, _ = train(layer)
final_model_path = model_path if model_path else self.model_path
orig_input_types = [type(x) for x in example_inputs]
fluid.dygraph.jit.save(
layer=layer, model_path=self.model_path, input_spec=example_inputs)
layer=layer,
model_path=final_model_path,
input_spec=example_inputs,
configs=configs)
new_input_types = [type(x) for x in example_inputs]
self.assertEqual(orig_input_types, new_input_types)
return layer
def test_save(self):
# train and save model
self.train_and_save_model()
def test_load_infernece(self):
def test_save_load(self):
# train and save model
train_layer = self.train_and_save_model()
# load model
infer_layer = fluid.dygraph.jit.load(self.model_path)
program_translator = ProgramTranslator()
program_translator.enable(False)
loaded_layer = fluid.dygraph.jit.load(self.model_path)
self.load_and_inference(train_layer, loaded_layer)
self.load_dygraph_state_dict(train_layer)
self.load_and_finetune(train_layer, loaded_layer)
program_translator.enable(True)
def load_and_inference(self, train_layer, infer_layer):
train_layer.eval()
infer_layer.eval()
# inference & compare
x = fluid.dygraph.to_variable(
np.random.random((1, 784)).astype('float32'))
self.assertTrue(
np.array_equal(train_layer(x).numpy(), infer_layer(x).numpy()))
def test_load_finetune(self):
# train and save model
train_layer = self.train_and_save_model()
# load model
load_train_layer = fluid.dygraph.jit.load(self.model_path)
def load_and_finetune(self, train_layer, load_train_layer):
train_layer.train()
load_train_layer.train()
# train & compare
_, _, train_loss = train(train_layer)
......@@ -149,6 +157,19 @@ class TestJitSaveLoad(unittest.TestCase):
self.assertTrue(
np.array_equal(train_loss.numpy(), load_train_loss.numpy()))
def load_dygraph_state_dict(self, train_layer):
train_layer.eval()
# construct new model
new_layer = LinearNet(784, 1)
model_dict, _ = fluid.dygraph.load_dygraph(self.model_path)
new_layer.set_dict(model_dict)
new_layer.eval()
# inference & compare
x = fluid.dygraph.to_variable(
np.random.random((1, 784)).astype('float32'))
self.assertTrue(
np.array_equal(train_layer(x).numpy(), new_layer(x).numpy()))
def test_save_get_program_failed(self):
layer = LinearNetNotDeclarative(784, 1)
example_inputs, layer, _ = train(layer)
......@@ -158,6 +179,31 @@ class TestJitSaveLoad(unittest.TestCase):
model_path=self.model_path,
input_spec=example_inputs)
def test_load_dygraph_no_path(self):
model_path = "model.test_jit_save_load.no_path"
new_layer = LinearNet(784, 1)
with self.assertRaises(ValueError):
model_dict, _ = fluid.dygraph.load_dygraph(model_path)
def test_load_dygraph_no_var_info(self):
model_path = "model.test_jit_save_load.no_var_info"
self.train_and_save_model(model_path=model_path)
# remove `__variables.info__`
var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME)
os.remove(var_info_path)
new_layer = LinearNet(784, 1)
with self.assertRaises(RuntimeError):
model_dict, _ = fluid.dygraph.load_dygraph(model_path)
def test_load_dygraph_not_var_file(self):
model_path = "model.test_jit_save_load.no_var_file"
configs = fluid.dygraph.jit.SaveLoadConfig()
configs.params_filename = "__params__"
self.train_and_save_model(model_path=model_path, configs=configs)
new_layer = LinearNet(784, 1)
with self.assertRaises(RuntimeError):
model_dict, _ = fluid.dygraph.load_dygraph(model_path)
class TestJitSaveLoadConfig(unittest.TestCase):
def setUp(self):
......
......@@ -3,7 +3,7 @@ set -e
# use default values
# FIXME: randomly fails with "Unknown command lines -c (or -m)".
launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
python ${launch_py} multi_process.py
python ${launch_py} multi_process.py launch
# use paddlecloud
echo "begin test use paddlecloud"
......@@ -18,12 +18,12 @@ export PADDLE_PORT=35019
export TRAINER_PORTS_NUM=2
distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch
str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
file_0="multi_process.check_0.log"
file_1="multi_process.check_1.log"
file_0="multi_process_launch.check_0.log"
file_1="multi_process_launch.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
......@@ -54,7 +54,7 @@ unset TRAINER_PORTS_NUM
echo ""
echo "paddle.distributed.launch async poll process test"
if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py abort; then
if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch abort; then
echo "train abort as planned"
fi
......
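# The expected strings above encode how launch.py expands the paddlecloud env
# vars into trainer endpoints: each node ip is paired with TRAINER_PORTS_NUM
# consecutive ports starting at PADDLE_PORT. A sketch of that expansion in
# Python (illustrative only, not the launch.py source):
def sketch_cloud_endpoints(node_ips, base_port, ports_num):
    return [
        "%s:%d" % (ip, base_port + offset)
        for ip in node_ips
        for offset in range(ports_num)
    ]

# reproduces worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020
assert sketch_cloud_endpoints(["127.0.0.1", "127.0.0.2"], 35019, 2) == [
    "127.0.0.1:35019", "127.0.0.1:35020", "127.0.0.2:35019", "127.0.0.2:35020"]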
......@@ -63,7 +63,7 @@ def case_generator(op_type, Xshape, diagonal, expected):
"diagonal: TypeError":
"diagonal in {} must be a python Int".format(op_type),
"input: ValueError":
"input shape in {} must be at least 2-D".format(op_type),
"x shape in {} must be at least 2-D".format(op_type),
}
class FailureCase(unittest.TestCase):
......@@ -71,7 +71,7 @@ def case_generator(op_type, Xshape, diagonal, expected):
data = fluid.data(shape=Xshape, dtype='float64', name=cls_name)
with self.assertRaisesRegexp(
eval(expected.split(':')[-1]), errmsg[expected]):
getattr(tensor, op_type)(input=data, diagonal=diagonal)
getattr(tensor, op_type)(x=data, diagonal=diagonal)
class SuccessCase(TrilTriuOpDefaultTest):
def initTestCase(self):
......
......@@ -81,7 +81,7 @@ class API_TestUnsqueeze(unittest.TestCase):
def test_out(self):
with fluid.program_guard(fluid.Program(), fluid.Program()):
data1 = fluid.layers.data('data1', shape=[-1, 10], dtype='float64')
result_squeeze = paddle.unsqueeze(data1, axes=[1])
result_squeeze = paddle.unsqueeze(data1, axis=[1])
place = fluid.CPUPlace()
exe = fluid.Executor(place)
input1 = np.random.random([5, 1, 10]).astype('float64')
......@@ -98,7 +98,7 @@ class TestUnsqueezeOpError(unittest.TestCase):
def test_axes_type():
x6 = fluid.layers.data(
shape=[-1, 10], dtype='float16', name='x3')
paddle.unsqueeze(x6, axes=3.2)
paddle.unsqueeze(x6, axis=3.2)
self.assertRaises(TypeError, test_axes_type)
......@@ -108,7 +108,7 @@ class API_TestUnsqueeze2(unittest.TestCase):
with fluid.program_guard(fluid.Program(), fluid.Program()):
data1 = fluid.data('data1', shape=[-1, 10], dtype='float64')
data2 = fluid.data('data2', shape=[1], dtype='int32')
result_squeeze = paddle.unsqueeze(data1, axes=data2)
result_squeeze = paddle.unsqueeze(data1, axis=data2)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
input1 = np.random.random([5, 1, 10]).astype('float64')
......@@ -125,7 +125,7 @@ class API_TestUnsqueeze3(unittest.TestCase):
with fluid.program_guard(fluid.Program(), fluid.Program()):
data1 = fluid.data('data1', shape=[-1, 10], dtype='float64')
data2 = fluid.data('data2', shape=[1], dtype='int32')
result_squeeze = paddle.unsqueeze(data1, axes=[data2, 3])
result_squeeze = paddle.unsqueeze(data1, axis=[data2, 3])
place = fluid.CPUPlace()
exe = fluid.Executor(place)
input1 = np.random.random([5, 1, 10, 1]).astype('float64')
......@@ -143,7 +143,7 @@ class API_TestDyUnsqueeze(unittest.TestCase):
input_1 = np.random.random([5, 1, 10]).astype("int32")
input1 = np.squeeze(input_1, axis=1)
input = fluid.dygraph.to_variable(input_1)
output = paddle.unsqueeze(input, axes=[1])
output = paddle.unsqueeze(input, axis=[1])
out_np = output.numpy()
self.assertTrue(np.allclose(input1, out_np))
......@@ -154,7 +154,7 @@ class API_TestDyUnsqueeze2(unittest.TestCase):
input_1 = np.random.random([5, 1, 10]).astype("int32")
input1 = np.squeeze(input_1, axis=1)
input = fluid.dygraph.to_variable(input_1)
output = paddle.unsqueeze(input, axes=1)
output = paddle.unsqueeze(input, axis=1)
out_np = output.numpy()
self.assertTrue(np.allclose(input1, out_np))
......
......@@ -248,7 +248,7 @@ def zeros(shape, dtype=None, name=None):
# shape is a Tensor
shape = paddle.fill_constant(shape=[2], dtype='int32', value=2)
data3 = paddle.ones(shape=shape, dtype='int32')
data3 = paddle.zeros(shape=shape, dtype='int32')
# [[0 0]
# [0 0]]
"""
......@@ -490,14 +490,13 @@ def _tril_triu_op(helper):
"""Base op of tril_op and triu_op
"""
op_type = helper.layer_type
x = helper.kwargs.get('input', None)
x = helper.kwargs.get('x', None)
assert x is not None, 'x cannot be None in {}'.format(op_type)
check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
op_type)
if len(x.shape) < 2:
raise ValueError("input shape in {} must be at least 2-D".format(
op_type))
raise ValueError("x shape in {} must be at least 2-D".format(op_type))
diagonal = helper.kwargs.get('diagonal', 0)
if not isinstance(diagonal, (int, )):
raise TypeError("diagonal in {} must be a python Int".format(op_type))
......@@ -521,18 +520,18 @@ def _tril_triu_op(helper):
return out
def tril(input, diagonal=0, name=None):
def tril(x, diagonal=0, name=None):
"""
:alias_main: paddle.tril
:alias: paddle.tril,paddle.tensor.tril,paddle.tensor.creation.tril
This op returns the lower triangular part of a matrix (2-D tensor) or batch
of matrices :attr:`input`, the other elements of the result tensor are set
of matrices :attr:`x`, the other elements of the result tensor are set
to 0. The lower triangular part of the matrix is defined as the elements
on and below the diagonal.
Args:
input (Variable): The input variable which is a Tensor.
x (Variable): The input variable x which is a Tensor.
Support data types: ``float64``, ``float32``, ``int32``, ``int64``.
diagonal (int, optional): The diagonal to consider, default value is 0.
If :attr:`diagonal` = 0, all elements on and below the main diagonal are
......@@ -545,47 +544,41 @@ def tril(input, diagonal=0, name=None):
user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Variable: Tensor, results of lower triangular operation by the specified diagonal of input tensor,
it's data type is the same as input's Tensor.
Variable: Tensor, results of lower triangular operation by the specified diagonal of the input tensor x,
its data type is the same as that of x.
Raises:
TypeError: diagonal is not an int type.
ValueError: dimension of :attr:`input` is less than 2.
ValueError: dimension of :attr:`x` is less than 2.
Examples:
.. code-block:: python
import numpy as np
import paddle.tensor as tensor
import paddle.fluid as fluid
import paddle
data = np.arange(1, 13, dtype="int64").reshape(3,-1)
# array([[ 1, 2, 3, 4],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
x = fluid.data(shape=(-1, 4), dtype='int64', name='x')
exe = fluid.Executor(fluid.CPUPlace())
# example 1, default diagonal
tril = tensor.tril(x)
tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[tril], return_numpy=True)
paddle.enable_imperative()
x = paddle.imperative.to_variable(data)
tril1 = paddle.tensor.tril(x)
# array([[ 1, 0, 0, 0],
# [ 5, 6, 0, 0],
# [ 9, 10, 11, 0]])
# example 2, positive diagonal value
tril = tensor.tril(x, diagonal=2)
tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[tril], return_numpy=True)
tril2 = paddle.tensor.tril(x, diagonal=2)
# array([[ 1, 2, 3, 0],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
# example 3, negative diagonal value
tril = tensor.tril(x, diagonal=-1)
tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[tril], return_numpy=True)
tril3 = paddle.tensor.tril(x, diagonal=-1)
# array([[ 0, 0, 0, 0],
# [ 5, 0, 0, 0],
# [ 9, 10, 0, 0]])
......@@ -593,23 +586,23 @@ def tril(input, diagonal=0, name=None):
"""
if in_dygraph_mode():
op = getattr(core.ops, 'tril_triu')
return op(input, 'diagonal', diagonal, "lower", True)
return op(x, 'diagonal', diagonal, "lower", True)
return _tril_triu_op(LayerHelper('tril', **locals()))
def triu(input, diagonal=0, name=None):
def triu(x, diagonal=0, name=None):
"""
:alias_main: paddle.triu
:alias: paddle.triu,paddle.tensor.triu,paddle.tensor.creation.triu
This op returns the upper triangular part of a matrix (2-D tensor) or batch of matrices
:attr:`input`, the other elements of the result tensor are set to 0.
:attr:`x`, the other elements of the result tensor are set to 0.
The upper triangular part of the matrix is defined as the elements on and
above the diagonal.
Args:
input (Variable): The input variable which is a Tensor.
x (Variable): The input variable x which is a Tensor.
Support data types: ``float64``, ``float32``, ``int32``, ``int64``.
diagonal (int, optional): The diagonal to consider, default value is 0.
If :attr:`diagonal` = 0, all elements on and above the main diagonal are
......@@ -622,47 +615,41 @@ def triu(input, diagonal=0, name=None):
user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Variable: Tensor, results of upper triangular operation by the specified diagonal of input tensor,
it's data type is the same as input's Tensor.
Variable: Tensor, results of upper triangular operation by the specified diagonal of the input tensor x,
its data type is the same as that of x.
Raises:
TypeError: diagonal is not an int type.
ValueError: dimension of :attr:`input` is less than 2.
ValueError: dimension of :attr:`x` is less than 2.
Examples:
.. code-block:: python
import numpy as np
import paddle.fluid as fluid
import paddle.tensor as tensor
import paddle
data = np.arange(1, 13, dtype="int64").reshape(3,-1)
# array([[ 1, 2, 3, 4],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
x = fluid.data(shape=(-1, 4), dtype='int64', name='x')
exe = fluid.Executor(fluid.CPUPlace())
paddle.enable_imperative()
# example 1, default diagonal
triu = tensor.triu(x)
triu_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[triu], return_numpy=True)
x = paddle.imperative.to_variable(data)
triu1 = paddle.tensor.triu(x)
# array([[ 1, 2, 3, 4],
# [ 0, 6, 7, 8],
# [ 0, 0, 11, 12]])
# example 2, positive diagonal value
triu = tensor.triu(x, diagonal=2)
triu_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[triu], return_numpy=True)
triu2 = paddle.tensor.triu(x, diagonal=2)
# array([[0, 0, 3, 4],
# [0, 0, 0, 8],
# [0, 0, 0, 0]])
# example 3, negative diagonal value
triu = tensor.triu(x, diagonal=-1)
triu_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[triu], return_numpy=True)
triu3 = paddle.tensor.triu(x, diagonal=-1)
# array([[ 1, 2, 3, 4],
# [ 5, 6, 7, 8],
# [ 0, 10, 11, 12]])
......@@ -670,7 +657,7 @@ def triu(input, diagonal=0, name=None):
"""
if in_dygraph_mode():
op = getattr(core.ops, 'tril_triu')
return op(input, 'diagonal', diagonal, "lower", False)
return op(x, 'diagonal', diagonal, "lower", False)
return _tril_triu_op(LayerHelper('triu', **locals()))
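# tril and triu dispatch to the same tril_triu kernel and differ only in the
# "lower" attribute. The keep/zero rule they implement fits in a few lines of
# NumPy (an equivalence sketch, not the Paddle kernel):
import numpy as np

def sketch_tril_triu(x, diagonal=0, lower=True):
    rows, cols = x.shape[-2], x.shape[-1]
    i, j = np.indices((rows, cols))
    # keep entries on/below the diagonal for tril, on/above it for triu
    mask = (j - i <= diagonal) if lower else (j - i >= diagonal)
    return np.where(mask, x, 0)

data = np.arange(1, 13, dtype="int64").reshape(3, -1)
assert (sketch_tril_triu(data, 0, lower=True) == np.tril(data)).all()
assert (sketch_tril_triu(data, 2, lower=False) == np.triu(data, 2)).all()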
......
......@@ -729,26 +729,32 @@ def bmm(x, y, name=None):
Examples:
import paddle
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[10, 3, 4], dtype='float32')
y = fluid.layers.data(name='y', shape=[10, 4, 5], dtype='float32')
out = paddle.bmm(x, y)
# In dygraph mode:
# In imperative mode:
import numpy as np
# size input1: (2, 2, 3) and input2: (2, 3, 2)
input1 = np.array([[[1.0, 1.0, 1.0],[2.0, 2.0, 2.0]],[[3.0, 3.0, 3.0],[4.0, 4.0, 4.0]]])
input2 = np.array([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],[[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]])
with fluid.dygraph.guard():
x = fluid.dygraph.to_variable(input1)
y = fluid.dygraph.to_variable(input2)
out = paddle.bmm(x, y)
#output size: (2, 2, 2)
#output value:
#[[[6.0, 6.0],[12.0, 12.0]],[[45.0, 45.0],[60.0, 60.0]]]
out_np = out.numpy()
paddle.enable_imperative()
x = paddle.imperative.to_variable(input1)
y = paddle.imperative.to_variable(input2)
out = paddle.bmm(x, y)
#output size: (2, 2, 2)
#output value:
#[[[6.0, 6.0],[12.0, 12.0]],[[45.0, 45.0],[60.0, 60.0]]]
out_np = out.numpy()
"""
x_shape = x.shape
y_shape = y.shape
if not len(x_shape) == len(y_shape) == 3:
raise ValueError(
"x and y should be 3-dimensional. But received x's dimention: {}, y's dimention: {}".
format(x_shape, y_shape))
if x_shape[2] != y_shape[1]:
raise ValueError(
"x's width must be equal with y's height. But received x's shape: {}, y's shape: {}".
format(x_shape, y_shape))
helper = LayerHelper('bmm', **locals())
if in_dygraph_mode():
return core.ops.bmm(x, y)
......
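# bmm is a strict batched matmul: both operands must be 3-D, batch sizes must
# match, and x's last dim must equal y's middle dim, i.e.
# (b, m, k) x (b, k, n) -> (b, m, n). NumPy reproduces the documented example
# (a semantics sketch, not the Paddle kernel):
import numpy as np

input1 = np.array([[[1.0, 1.0, 1.0], [2.0, 2.0, 2.0]],
                   [[3.0, 3.0, 3.0], [4.0, 4.0, 4.0]]])
input2 = np.array([[[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]],
                   [[4.0, 4.0], [5.0, 5.0], [6.0, 6.0]]])
expected = np.array([[[6.0, 6.0], [12.0, 12.0]],
                     [[45.0, 45.0], [60.0, 60.0]]])
assert np.allclose(np.matmul(input1, input2), expected)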
......@@ -42,11 +42,32 @@ from ..fluid import layers
import paddle
__all__ = [
'cast', 'concat', 'expand', 'expand_as', 'flatten', 'gather', 'gather_nd',
'reshape', 'reverse', 'scatter', 'scatter_nd_add', 'scatter_nd',
'shard_index', 'slice', 'split', 'squeeze', 'stack', 'strided_slice',
'transpose', 'unique', 'unique_with_counts', 'unsqueeze', 'unstack', 'flip',
'unbind', 'roll'
'cast',
'concat',
'expand',
'expand_as',
'flatten',
'gather',
'gather_nd',
'reshape',
'reverse',
'scatter',
'scatter_nd_add',
'scatter_nd',
'shard_index',
'slice',
'split',
'squeeze',
'stack',
'strided_slice',
'transpose',
'unique',
'unique_with_counts',
'unsqueeze',
'unstack',
'flip',
'unbind',
'roll',
]
......@@ -417,7 +438,7 @@ def stack(x, axis=0, name=None):
Args:
x (Tensor|list[Tensor]): Input ``x`` can be a single tensor, or a ``list`` of tensors.
If ``x`` is a ``list``, the Tensors in ``x``
must be of the same shape and dtype. Support data types: float32, float64, int32, int64.
must be of the same shape and dtype. Supported data types: float32, float64, int32, int64.
axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``,
where ``R`` is the number of dimensions of the first input tensor ``x[0]``.
If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0.
......@@ -559,18 +580,19 @@ def squeeze(x, axis=None, name=None):
out.shape = [1, 3, 5]
Args:
input (Tensor): The input Tensor. Support data type: float32, float64, int8, int32, int64.
x (Tensor): The input Tensor. Supported data type: float32, float64, bool, int8, int32, int64.
axis (int|list|tuple, optional): An integer or list of integers, indicating the dimensions to be squeezed. Default is None.
The range of axis is :math:`[-ndim(input), ndim(input))`.
If axis is negative, :math:`axis = axis + ndim(input)`.
If axis is None, all the dimensions of input of size 1 will be removed.
The range of axis is :math:`[-ndim(x), ndim(x))`.
If axis is negative, :math:`axis = axis + ndim(x)`.
If axis is None, all the dimensions of x of size 1 will be removed.
name (str, optional): Please refer to :ref:`api_guide_Name`, Default None.
Returns:
Tensor: Output squeezed Tensor. Data type is same as input Tensor.
Tensor: Squeezed Tensor with the same data type as input Tensor.
Examples:
.. code-block:: python
import paddle
paddle.enable_imperative()
......@@ -590,87 +612,50 @@ def squeeze(x, axis=None, name=None):
return layers.squeeze(x, axis, name)
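# paddle.squeeze delegates to layers.squeeze; its axis semantics match
# np.squeeze, so NumPy serves as a compact reference (an equivalence sketch
# under that assumption, not the Paddle implementation):
import numpy as np

x = np.zeros((1, 3, 1, 5))
assert np.squeeze(x).shape == (3, 5)             # axis=None drops every size-1 dim
assert np.squeeze(x, axis=0).shape == (3, 1, 5)  # a given axis must have size 1
assert np.squeeze(x, axis=-2).shape == (1, 3, 5) # negative axis wraps: -2 -> 2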
def unsqueeze(input, axes, out=None, name=None):
def unsqueeze(x, axis, name=None):
"""
:alias_main: paddle.unsqueeze
:alias: paddle.unsqueeze,paddle.tensor.unsqueeze,paddle.tensor.manipulation.unsqueeze
Insert single-dimensional entries to the shape of a Tensor. Takes one
required argument axes, a list of dimensions that will be inserted.
Dimension indices in axes are as seen in the output tensor.
For example:
.. code-block:: text
:alias: paddle.unsqueeze, paddle.tensor.unsqueeze, paddle.tensor.manipulation.unsqueeze
Given a tensor such that tensor with shape [3, 4, 5],
then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
Insert single-dimensional entries to the shape of input Tensor ``x``. Takes one
required argument axis, a dimension or list of dimensions that will be inserted.
Dimension indices in axis are as seen in the output tensor.
Args:
input (Variable): The input Tensor to be unsqueezed. It is a N-D Tensor of data types float32, float64, int32.
axes (int|list|tuple|Variable): Indicates the dimensions to be inserted. The data type is ``int32`` . If ``axes`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``axes`` is an Variable, it should be an 1-D Tensor .
name (str|None): Name for this layer.
x (Tensor): The input Tensor to be unsqueezed. Supported data type: float32, float64, bool, int8, int32, int64.
axis (int|list|tuple|Tensor): Indicates the dimensions to be inserted. The data type is ``int32`` .
If ``axis`` is a list or tuple, the elements of it should be integers or Tensors with shape [1].
If ``axis`` is a Tensor, it should be a 1-D Tensor.
If ``axis`` is negative, ``axis = axis + ndim(x) + 1``.
name (str|None): Name for this layer. Please refer to :ref:`api_guide_Name`, Default None.
Returns:
Variable: Output unsqueezed Tensor, with data type being float32, float64, int32, int64.
Tensor: Unsqueezed Tensor with the same data type as input Tensor.
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
with fluid.dygraph.guard():
input_1 = np.random.random([5, 10]).astype("int32")
# input is a variable which shape is [5, 10]
input = fluid.dygraph.to_variable(input_1)
paddle.enable_imperative()
x = paddle.rand([5, 10])
print(x.shape) # [5, 10]
out1 = paddle.unsqueeze(x, axis=0)
print(out1.shape) # [1, 5, 10]
out2 = paddle.unsqueeze(x, axis=[0, 2])
print(out2.shape) # [1, 5, 1, 10]
output = paddle.unsqueeze(input, axes=[1])
# output.shape [5, 1, 10]
axis = paddle.fluid.dygraph.to_variable([0, 1, 2])
out3 = paddle.unsqueeze(x, axis=axis)
print(out3.shape) # [1, 1, 1, 5, 10]
"""
if not isinstance(axes, (int, list, tuple, Variable)):
raise TypeError(
"The type of 'axes' in unsqueeze must be int, list, tuple or Variable, but "
"received %s." % (type(axes)))
helper = LayerHelper("unsqueeze2", **locals())
inputs = {"X": input}
attrs = {}
def _to_Variable_list(one_list):
Variable_list = []
for ele in one_list:
if isinstance(ele, Variable):
ele.stop_gradient = True
Variable_list.append(ele)
else:
assert (isinstance(ele, int))
temp_out = helper.create_variable_for_type_inference('int32')
fill_constant([1], 'int32', ele, force_cpu=True, out=temp_out)
Variable_list.append(temp_out)
return Variable_list
if isinstance(axes, int):
axes = [axes]
if isinstance(axes, Variable):
axes.stop_gradient = True
inputs["AxesTensor"] = axes
elif isinstance(axes, (list, tuple)):
contain_var = not all(not isinstance(ele, Variable) for ele in axes)
if contain_var:
inputs["AxesTensorList"] = _to_Variable_list(axes)
else:
attrs["axes"] = axes
out = helper.create_variable_for_type_inference(dtype=input.dtype)
x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op(
type="unsqueeze2",
inputs=inputs,
attrs=attrs,
outputs={"Out": out,
"XShape": x_shape})
if isinstance(axis, int):
axis = [axis]
return out
return layers.unsqueeze(x, axis, name)
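# After normalizing a bare int to a one-element list, the wrapper defers to
# layers.unsqueeze. Each axis index refers to a position in the *output*
# tensor, so inserting at [0, 2] on a [5, 10] input yields [1, 5, 1, 10], as
# in the docstring. np.expand_dims follows the same rule (equivalence sketch,
# not the Paddle implementation; tuple axis needs NumPy >= 1.18):
import numpy as np

x = np.random.rand(5, 10)
assert np.expand_dims(x, 0).shape == (1, 5, 10)
assert np.expand_dims(x, (0, 2)).shape == (1, 5, 1, 10)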
def gather(input, index, overwrite=True):
......
......@@ -915,7 +915,7 @@ def mm(input, mat2, name=None):
return out
def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):
"""
:alias_main: paddle.addmm
:alias: paddle.addmm,paddle.tensor.addmm,paddle.tensor.math.addmm
......@@ -935,8 +935,8 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
input (Variable): The input Tensor/LoDTensor to be added to the final result.
x (Variable): The first input Tensor/LoDTensor for matrix multiplication.
y (Variable): The second input Tensor/LoDTensor for matrix multiplication.
alpha (float): Coefficient of $x*y$.
beta (float): Coefficient of $input$.
alpha (float): Coefficient of $x*y$.
name (str, optional): Name of the output. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default is None.
Returns:
......@@ -947,25 +947,43 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
import numpy as np
import paddle
import paddle.fluid as fluid
input = fluid.data(name='input', shape=[2, 2], dtype='float32')
x = fluid.data(name='x', shape=[2, 2], dtype='float32')
y = fluid.data(name='y', shape=[2, 2], dtype='float32')
out = paddle.addmm( input=input, x=x, y=y, alpha=5.0, beta=0.5 )
data_x = np.ones((2, 2)).astype(np.float32)
data_y = np.ones((2, 2)).astype(np.float32)
data_input = np.ones((2, 2)).astype(np.float32)
place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
exe = fluid.Executor(place)
results = exe.run(fluid.default_main_program(),
fetch_list=[out], feed={"input": data_input, 'x': data_x, "y": data_y})
print( np.array(results[0]) )
paddle.enable_imperative()
x = paddle.imperative.to_variable(data_x)
y = paddle.imperative.to_variable(data_y)
input = paddle.imperative.to_variable(data_input)
out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )
print( out.numpy() )
# [[10.5 10.5]
# [10.5 10.5]]
"""
input_shape = input.shape
x_shape = x.shape
y_shape = y.shape
if not len(input_shape) == len(x_shape) == len(y_shape) == 2:
raise ValueError("The dimention of input, x, y should be 2 but receive input's shape: {}, x's shape: {}, y's shape: {}".format(input_shape, x_shape, y_shape))
if input_shape[0] != x_shape[0]:
if input_shape[0] != 1:
raise ValueError( "When x's dimension[0] is not equal with input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0]))
if input_shape[1] != y_shape[1] and input_shape[1] != 1:
raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1]))
if input_shape[1] != y_shape[1]:
if input_shape[1] != 1:
raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1]))
if input_shape[0] != x_shape[0] and input_shape[0] != 1:
raise ValueError( "When x's dimension[0] is not equal with input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0]))
if x_shape[1] != y_shape[0]:
raise ValueError("The input Variable x's width must be equal with Variable y' height. But received x's shape = {}, y's shape = {}.".format(x_shape, y_shape))
if in_dygraph_mode():
out = core.ops.addmm(input, x, y, "Alpha", alpha, "Beta", beta)
return out
......@@ -974,7 +992,7 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
attrs = {'Alpha': alpha, 'Beta': beta}
helper = LayerHelper("addmm", **locals())
check_variable_and_dtype(x, 'Input', ['float32', 'float64'], 'addmm')
check_variable_and_dtype(input, 'Input', ['float32', 'float64'], 'addmm')
check_variable_and_dtype(x, 'X', ['float32', 'float64'], 'addmm')
check_variable_and_dtype(y, 'Y', ['float32', 'float64'], 'addmm')
out = helper.create_variable_for_type_inference(dtype=x.dtype)
......
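# addmm computes out = beta * input + alpha * (x @ y), and input may broadcast
# along a dimension only when that dimension is 1 -- exactly what the shape
# checks above enforce. NumPy reproduces the documented example (a semantics
# sketch, not the Paddle kernel):
import numpy as np

data_x = np.ones((2, 2), dtype=np.float32)
data_y = np.ones((2, 2), dtype=np.float32)
data_input = np.ones((2, 2), dtype=np.float32)
out = 0.5 * data_input + 5.0 * np.matmul(data_x, data_y)
assert np.allclose(out, [[10.5, 10.5], [10.5, 10.5]])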
......@@ -21,3 +21,4 @@ prettytable
objgraph
astor
pathlib
netifaces
......@@ -152,6 +152,7 @@ packages=['paddle',
'paddle.fleet.dataset',
'paddle.fleet.metrics',
'paddle.fleet.proto',
'paddle.fleet.utils',
'paddle.framework',
'paddle.fluid',
'paddle.fluid.dygraph',
......