diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
index 079fb1479861ca0840b47470339f2f7a5b6bffa8..b50b4f37caecd8d8d5c393ee3a5c5b76c1f406be 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
+#include <cmath>
 #include
 #include
 #include
@@ -74,12 +75,17 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
   auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
   auto weights_shape = weights->dims();
   auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
+  auto* weights_data = weights->mutable_data<float>(platform::CPUPlace());
 
-  EigenMatrixArrayMap weights_array_2d(
-      weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
-      weights_shape_2d[1]);
+  EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0],
+                                       weights_shape_2d[1]);
 
   weights_array_2d.colwise() *= scale_array;
+
+  // Check for subnormal values that slow down convolution execution
+  for (int i = 0; i < weights->numel(); ++i) {
+    if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0;
+  }
 }
 
 void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
@@ -108,13 +114,6 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
 
     GET_CONV_BN_NODES(conv_ac_pattern);
 
-    // check if fuse can be done and if MKL-DNN should be used
-    FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel);
-    if (fuse_option == DO_NOT_FUSE) {
-      VLOG(3) << "do not perform conv+affinechannel fuse";
-      return;
-    }
-
     // Create eltwise_y (conv bias) variable
     VarDesc eltwise_y_in_desc(
         patterns::PDNodeName(name_scope_, "eltwise_y_in"));
@@ -143,6 +142,7 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
     desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
     desc.SetType("elementwise_add");
     desc.SetAttr("axis", 1);
+    desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists<bool>("use_mkldnn"));
     auto eltwise_op = g->CreateOpNode(&desc);  // OpDesc will be copied.
 
     GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel});
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index a1c1e6de5fd44617a30f235a0416d897bf932075..9fbc97d55090345af3b3b12bcd138bfaecd346cc 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -15,7 +15,6 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
@@ -103,8 +102,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   // params_file_ fields.
   CP_MEMBER(opt_cache_dir_);
-  prog_file_ = std::move(other.prog_file_);
-  params_file_ = std::move(other.params_file_);
+  CP_MEMBER(prog_file_);
+  CP_MEMBER(params_file_);
 
   CP_MEMBER(use_fc_padding_);
 
   // GPU related.
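For reference, the subnormal flush above can be tried in isolation. A minimal standalone sketch (not part of the patch): subnormal magnitudes, below FLT_MIN, trigger slow microcoded paths on many x86 CPUs, so zeroing them trades a negligible accuracy loss for speed.

    #include <cmath>
    #include <vector>

    // Flush denormal (subnormal) weights to zero, mirroring the fuse pass above.
    void FlushSubnormalsToZero(std::vector<float>* weights) {
      for (float& w : *weights) {
        if (std::fpclassify(w) == FP_SUBNORMAL) w = 0.0f;
      }
    }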
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index a8c8058c6b714dcd6f283c35b50bef55446e62bb..127a41aee890808258367fb40804a9547b8fdbb0 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -32,7 +32,6 @@
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 #include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/memory/memcpy.h"
@@ -517,6 +516,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
+  // TODO(NHZlX): Should add the link to the doc of
+  // paddle_infer::CreatePredictor
   if (config.glog_info_disabled()) {
     FLAGS_logtostderr = 1;
     FLAGS_minloglevel = 2;  // GLOG_ERROR
@@ -1058,3 +1059,122 @@ USE_TRT_CONVERTER(skip_layernorm);
 USE_TRT_CONVERTER(slice);
 USE_TRT_CONVERTER(scale);
 #endif
+
+namespace paddle_infer {
+
+void Tensor::Reshape(const std::vector<int> &shape) { tensor_->Reshape(shape); }
+
+std::vector<int> Tensor::shape() const { return tensor_->shape(); }
+
+void Tensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
+  return tensor_->SetLoD(x);
+}
+
+std::vector<std::vector<size_t>> Tensor::lod() const { return tensor_->lod(); }
+
+const std::string &Tensor::name() const { return tensor_->name(); }
+
+DataType Tensor::type() const { return tensor_->type(); }
+
+Predictor::Predictor(const Config &config) {
+  const_cast<Config *>(&config)->SwitchUseFeedFetchOps(false);
+  // The second parameter indicates that the discarded log is not printed
+  predictor_ = paddle::CreatePaddlePredictor<
+      Config, paddle::PaddleEngineKind::kAnalysis>(config);
+}
+
+std::vector<std::string> Predictor::GetInputNames() {
+  return predictor_->GetInputNames();
+}
+
+std::unique_ptr<Tensor> Predictor::GetInputHandle(const std::string &name) {
+  auto zero_copy_tensor = predictor_->GetInputTensor(name);
+  std::unique_ptr<Tensor> tensor(new Tensor(std::move(zero_copy_tensor)));
+  return tensor;
+}
+
+std::vector<std::string> Predictor::GetOutputNames() {
+  return predictor_->GetOutputNames();
+}
+
+std::unique_ptr<Tensor> Predictor::GetOutputHandle(const std::string &name) {
+  auto zero_copy_tensor = predictor_->GetOutputTensor(name);
+  std::unique_ptr<Tensor> tensor(new Tensor(std::move(zero_copy_tensor)));
+  return tensor;
+}
+
+bool Predictor::Run() { return predictor_->ZeroCopyRun(); }
+
+std::unique_ptr<Predictor> Predictor::Clone() {
+  auto analysis_pred = predictor_->Clone();
+  std::unique_ptr<Predictor> pred(new Predictor(std::move(analysis_pred)));
+  return pred;
+}
+
+void Predictor::ClearIntermediateTensor() {
+  predictor_->ClearIntermediateTensor();
+}
+
+int GetNumBytesOfDataType(DataType dtype) {
+  switch (dtype) {
+    case DataType::FLOAT32:
+      return sizeof(float);
+    case DataType::INT64:
+      return sizeof(int64_t);
+    case DataType::INT32:
+      return sizeof(int32_t);
+    case DataType::UINT8:
+      return sizeof(uint8_t);
+    default:
+      assert(false);
+      return -1;
+  }
+}
+
+std::string GetVersion() { return paddle::get_version(); }
+
+std::string UpdateDllFlag(const char *name, const char *value) {
+  return paddle::UpdateDllFlag(name, value);
+}
+
+}  // namespace paddle_infer
+
+namespace paddle_infer {
+std::shared_ptr<Predictor> CreatePredictor(const Config &config) {  // NOLINT
+  std::shared_ptr<Predictor> predictor(new Predictor(config));
+  return predictor;
+}
+
+namespace services {
+PredictorPool::PredictorPool(const Config &config, size_t size) {
+  PADDLE_ENFORCE_GE(
+      size, 1UL,
+      paddle::platform::errors::InvalidArgument(
+          "The predictor pool size should be no less than 1, but it's (%d)",
+          size));
+  Config copy_config(config);
+  main_pred_.reset(new Predictor(config));
+  for (size_t i = 0; i < size - 1; i++) {
+    if (config.tensorrt_engine_enabled()) {
+      Config config_tmp(copy_config);
+      preds_.push_back(
+          std::move(std::unique_ptr<Predictor>(new Predictor(config_tmp))));
+    } else {
+      preds_.push_back(std::move(main_pred_->Clone()));
+    }
+  }
+}
+
+Predictor *PredictorPool::Retrive(size_t idx) {
+  PADDLE_ENFORCE_LT(
+      idx, preds_.size() + 1,
+      paddle::platform::errors::InvalidArgument(
+          "There are (%d) predictors in the pool, but the idx is (%d)",
+          preds_.size() + 1, idx));
+  if (idx == 0) {
+    return main_pred_.get();
+  }
+  return preds_[idx - 1].get();
+}
+}  // namespace services
+}  // namespace paddle_infer
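With the pieces above in place, a caller drives the new API roughly as follows. This is a minimal sketch based on the tests later in this patch; the model paths are placeholders.

    #include "paddle/fluid/inference/api/paddle_inference_api.h"
    #include <functional>
    #include <numeric>
    #include <vector>

    void RunOnce() {
      paddle_infer::Config config;
      config.SetModel("model/__model__", "model/params");  // hypothetical paths
      auto predictor = paddle_infer::CreatePredictor(config);

      // Feed a single input tensor.
      auto input = predictor->GetInputHandle(predictor->GetInputNames()[0]);
      std::vector<float> data(1 * 3 * 318 * 318, 1.0f);
      input->Reshape({1, 3, 318, 318});
      input->CopyFromCpu(data.data());

      predictor->Run();

      // Fetch the first output back to host memory.
      auto output = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
      auto out_shape = output->shape();
      int out_num = std::accumulate(out_shape.begin(), out_shape.end(), 1,
                                    std::multiplies<int>());
      std::vector<float> out(out_num);
      output->CopyToCpu(out.data());
    }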
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 458eecfeea6ff27c96a8864ba8a08a9e5c587df5..2f608da531f25e1a5665744f7e9a2968cc9d0d64 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -112,6 +112,12 @@ void PaddleBuf::Free() {
   }
 }
 
+NativeConfig::NativeConfig() {
+  LOG(WARNING) << "The paddle::NativeConfig interface is going to be "
+                  "deprecated in the next release, please use the latest "
+                  "paddle_infer::Config instead.";
+}
+
 std::string get_version() {
   std::stringstream ss;
   ss << "version: " << framework::paddle_version() << "\n";
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 3d5b40c93dad071052217677e387ba54011fb666..07d6dcf86e9814e5bfc932d8320b549d55fe88ae 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -25,6 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -311,6 +313,8 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
+  // TODO(NHZlX): Should add the link to the doc of
+  // paddle_infer::CreatePredictor
   VLOG(3) << "create NativePaddlePredictor";
   if (config.use_gpu) {
     // 1. GPU memory
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 386d20103a71acb34cd47ddf5527f580cc5bf5b1..064f63542683a0d95985382385b182d794da0068 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -347,6 +347,7 @@ class PD_INFER_DECL PaddlePredictor {
 /// place of inference, etc.)
 ///
 struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config {
+  NativeConfig();
   /// GPU related fields.
   bool use_gpu{false};
   int device{0};
@@ -421,7 +422,8 @@ enum class PaddleEngineKind {
 };
 
 template <typename ConfigT>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
+PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
+    const ConfigT& config);
 
 template <>
 PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
@@ -437,6 +439,4 @@ PD_INFER_DECL std::string get_version();
 
 PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value);
 
-PD_INFER_DECL std::shared_ptr<framework::Cipher> MakeCipher(
-    const std::string& config_file);
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 6f30ad95f168cebe9702c90fbd2cca2c79a0e83f..da5d7411693c92eaa2066c7f76d56970f8939bc7 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -22,9 +22,124 @@ limitations under the License. */
 #pragma once
 
 #include <cassert>
+#include <map>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "paddle_analysis_config.h"  // NOLINT
 #include "paddle_api.h"              // NOLINT
+
+namespace paddle_infer {
+using DataType = paddle::PaddleDType;
+using PlaceType = paddle::PaddlePlace;
+using PrecisionType = paddle::AnalysisConfig::Precision;
+using Config = paddle::AnalysisConfig;
+
+class PD_INFER_DECL Tensor {
+ public:
+  // Can only be created by predictor->GetInputHandle(const std::string& name)
+  // or predictor->GetOutputHandle(const std::string& name)
+  Tensor() = delete;
+  explicit Tensor(std::unique_ptr<paddle::ZeroCopyTensor>&& tensor)
+      : tensor_(std::move(tensor)) {}
+  void Reshape(const std::vector<int>& shape);
+
+  template <typename T>
+  void CopyFromCpu(const T* data);
+
+  // should add the place
+  template <typename T>
+  T* mutable_data(PlaceType place);
+
+  template <typename T>
+  void CopyToCpu(T* data);
+
+  template <typename T>
+  T* data(PlaceType* place, int* size) const;
+
+  void SetLoD(const std::vector<std::vector<size_t>>& x);
+  std::vector<std::vector<size_t>> lod() const;
+
+  DataType type() const;
+
+  std::vector<int> shape() const;
+  const std::string& name() const;
+
+ private:
+  std::unique_ptr<paddle::ZeroCopyTensor> tensor_;
+};
+
+class PD_INFER_DECL Predictor {
+ public:
+  Predictor() = default;
+  ~Predictor() {}
+  // Use for clone
+  explicit Predictor(std::unique_ptr<paddle::PaddlePredictor>&& pred)
+      : predictor_(std::move(pred)) {}
+
+  explicit Predictor(const Config& config);
+
+  std::vector<std::string> GetInputNames();
+  std::unique_ptr<Tensor> GetInputHandle(const std::string& name);
+
+  bool Run();
+
+  std::vector<std::string> GetOutputNames();
+  std::unique_ptr<Tensor> GetOutputHandle(const std::string& name);
+
+  std::unique_ptr<Predictor> Clone();
+  void ClearIntermediateTensor();
+
+ private:
+  std::unique_ptr<paddle::PaddlePredictor> predictor_;
+};
+
+PD_INFER_DECL std::shared_ptr<Predictor> CreatePredictor(
+    const Config& config);  // NOLINT
+PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype);
+
+PD_INFER_DECL std::string GetVersion();
+PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value);
+
+template <typename T>
+void Tensor::CopyFromCpu(const T* data) {
+  tensor_->copy_from_cpu(data);
+}
+
+template <typename T>
+void Tensor::CopyToCpu(T* data) {
+  return tensor_->copy_to_cpu(data);
+}
+
+template <typename T>
+T* Tensor::mutable_data(PlaceType place) {
+  return tensor_->mutable_data<T>(place);
+}
+
+template <typename T>
+T* Tensor::data(PlaceType* place, int* size) const {
+  return tensor_->data<T>(place, size);
+}
+
+}  // namespace paddle_infer
+
+namespace paddle_infer {
+namespace services {
+
+class PD_INFER_DECL PredictorPool {
+ public:
+  PredictorPool() = delete;
+  PredictorPool(const PredictorPool&) = delete;
+  PredictorPool& operator=(const PredictorPool&) = delete;
+
+  explicit PredictorPool(const Config& config, size_t size = 1);
+  Predictor* Retrive(size_t idx);
+
+ private:
+  std::shared_ptr<Predictor> main_pred_;
+  std::vector<std::unique_ptr<Predictor>> preds_;
+};
+}  // namespace services
+}  // namespace paddle_infer
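The pool declared above is meant for sharing one loaded model across worker threads: index 0 is the main predictor, indices 1..size-1 are clones (or independently built predictors under TensorRT). A rough usage sketch; the thread count is illustrative:

    #include "paddle/fluid/inference/api/paddle_inference_api.h"
    #include <thread>
    #include <vector>

    void ServeWithPool(const paddle_infer::Config& config) {
      // One predictor per worker thread.
      paddle_infer::services::PredictorPool pool(config, 4);

      std::vector<std::thread> workers;
      for (size_t i = 0; i < 4; ++i) {
        workers.emplace_back([&pool, i] {
          paddle_infer::Predictor* pred = pool.Retrive(i);
          // ... feed inputs, pred->Run(), fetch outputs ...
        });
      }
      for (auto& t : workers) t.join();
    }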
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index ffb70700b5f98a51b579a68f746ea1ee6a6d9f7b..98a36a3308dc539ee5aecad9e71f50be310e584c 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -185,12 +185,14 @@ void CpuPassStrategy::EnableMKLDNN() {
     passes_.insert(passes_.begin(), "mkldnn_placement_pass");
 
     for (auto &pass : std::vector<std::string>({
-             "depthwise_conv_mkldnn_pass",    //
-             "conv_bn_fuse_pass",             // Execute BN passes again to
-             "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
-             "conv_transpose_bn_fuse_pass",   //
-             "conv_transpose_eltwiseadd_bn_fuse_pass",  //
-             "conv_bias_mkldnn_fuse_pass",    //
+             "depthwise_conv_mkldnn_pass",    //
+             "conv_bn_fuse_pass",             // Execute BN passes again to
+             "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
+             "conv_affine_channel_fuse_pass",             //
+             "conv_eltwiseadd_affine_channel_fuse_pass",  //
+             "conv_transpose_bn_fuse_pass",               //
+             "conv_transpose_eltwiseadd_bn_fuse_pass",    //
+             "conv_bias_mkldnn_fuse_pass",                //
              "conv_transpose_bias_mkldnn_fuse_pass",
              "conv3d_bias_mkldnn_fuse_pass",  //
              "conv_elementwise_add_mkldnn_fuse_pass",
diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
index 24cd8e0368182ae597e48765bc0167ca1eca6bd3..5cfa3d86377874d0937964339a8b60a3ebd2486f 100644
--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
@@ -54,7 +54,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
     auto ptr = new SkipLayerNormPluginDynamic(
         bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
     ptr->bias_gpu_ = bias_gpu_;
-    ptr->scale_gpu_ = bias_gpu_;
+    ptr->scale_gpu_ = scale_gpu_;
     return ptr;
   }
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 814deda6729278e2e9f9e76ff83bbdd4966821c1..2bd30bc05179e2881c4ecb321d76d5506233cc0e 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -515,3 +515,9 @@ if(WITH_MKLDNN)
   inference_analysis_test(test_analyzer_capi_ner SRCS analyzer_capi_ner_tester.cc
               EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
               ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model)
+
+if(WITH_GPU)
+  inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc
+            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+            ARGS --infer_model=${RESNET50_MODEL_DIR})
+endif()
diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
index 5840a4c42b3b1065410dc1509cf0cee2480bd596..31701c59ec33dfced5745f7f16d8f00ffce462ef 100644
--- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
@@ -72,3 +72,59 @@ TEST(AnalysisPredictor, use_gpu) {
 }  // namespace inference
 }  // namespace paddle
+
+namespace paddle_infer {
+
+TEST(Predictor, use_gpu) {
+  std::string model_dir = FLAGS_infer_model + "/" + "model";
+  Config config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.EnableLiteEngine(PrecisionType::kFloat32);
+
+  auto predictor =
+      CreatePredictor(config);
+
+  const int batch = 1;
+  const int channel = 3;
+  const int height = 318;
+  const int width = 318;
+  const int input_num = batch * channel * height * width;
+  std::vector<float> input(input_num, 1);
+
+  auto input_names = predictor->GetInputNames();
+  auto input_t = predictor->GetInputHandle(input_names[0]);
+
+  input_t->Reshape({1, 3, 318, 318});
+  input_t->CopyFromCpu(input.data());
+  predictor->Run();
+
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputHandle(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  size_t out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                   std::multiplies<int>());
+
+  std::vector<float> out_data;
+  out_data.resize(out_num);
+  output_t->CopyToCpu(out_data.data());
+
+  const std::vector<float> truth_values = {
+      127.780396f, 738.16656f,  1013.2264f,  -438.17206f, 366.4022f,
+      927.66187f,  736.2241f,   -633.68567f, -329.92737f, -430.15637f,
+      -633.0639f,  -146.54858f, -1324.2804f, -1349.3661f, -242.67671f,
+      117.44864f,  -801.7251f,  -391.51495f, -404.8202f,  454.16132f,
+      515.48206f,  -133.03114f, 69.293076f,  590.09753f,  -1434.6917f,
+      -1070.8903f, 307.0744f,   400.52573f,  -316.12177f, -587.1265f,
+      -161.05742f, 800.3663f,   -96.47157f,  748.708f,    868.17645f,
+      -447.9403f,  112.73656f,  1127.1992f,  47.43518f,   677.7219f,
+      593.1881f,   -336.4011f,  551.3634f,   397.82474f,  78.39835f,
+      -715.4006f,  405.96988f,  404.25684f,  246.01978f,  -8.430191f,
+      131.36617f,  -648.0528f};
+
+  float* data_o = out_data.data();
+  for (size_t j = 0; j < out_num; j += 10) {
+    EXPECT_NEAR((data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0.,
+                10e-5);
+  }
+}
+
+}  // namespace paddle_infer
diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fee7c35581d3293f0036360b64961910d9eb02a7
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc
@@ -0,0 +1,95 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <cstring>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
+
+namespace paddle_infer {
+
+TEST(Predictor, use_gpu) {
+  LOG(INFO) << GetVersion();
+  UpdateDllFlag("conv_workspace_size_limit", "4000");
+  std::string model_dir = FLAGS_infer_model + "/model";
+  Config config;
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.EnableUseGpu(100, 0);
+
+  auto predictor = CreatePredictor(config);
+  auto pred_clone = predictor->Clone();
+
+  std::vector<int> in_shape = {1, 3, 318, 318};
+  int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
+                               [](int &a, int &b) { return a * b; });
+
+  std::vector<float> input(in_num, 0);
+
+  auto input_names = predictor->GetInputNames();
+  auto input_t = predictor->GetInputHandle(input_names[0]);
+
+  input_t->Reshape(in_shape);
+  input_t->CopyFromCpu(input.data());
+  predictor->Run();
+
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputHandle(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+
+  std::vector<float> out_data;
+  out_data.resize(out_num);
+  output_t->CopyToCpu(out_data.data());
+  predictor->ClearIntermediateTensor();
+}
+
+TEST(PredictorPool, basic) {
+  LOG(INFO) << GetVersion();
+  UpdateDllFlag("conv_workspace_size_limit", "4000");
+  std::string model_dir = FLAGS_infer_model + "/model";
+  Config config;
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.EnableUseGpu(100, 0);
+
+  services::PredictorPool pred_pool(config, 4);
+  auto pred = pred_pool.Retrive(2);
+
+  std::vector<int> in_shape = {1, 3, 318, 318};
+  int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
+                               [](int &a, int &b) { return a * b; });
+  std::vector<float> input(in_num, 0);
+
+  auto in_names = pred->GetInputNames();
+  auto input_t = pred->GetInputHandle(in_names[0]);
+  input_t->name();
+  input_t->Reshape(in_shape);
+  input_t->CopyFromCpu(input.data());
+  pred->Run();
+  auto out_names = pred->GetOutputNames();
+  auto output_t = pred->GetOutputHandle(out_names[0]);
+  auto out_type = output_t->type();
+  LOG(INFO) << GetNumBytesOfDataType(out_type);
+  if (out_type == DataType::FLOAT32) {
+    PlaceType place;
+    int size;
+    output_t->data<float>(&place, &size);
+  }
+}
+
+}  // namespace paddle_infer
diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
index 8ffa3efdf0556bd7cde7efa615f60853ad18d903..c7c7356b6e8831bc0bcd0e9ea4ad0fbdec8b6be2 100644
--- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
@@ -41,7 +41,7 @@ TEST(AnalysisPredictor, use_gpu) {
   SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
 
   std::vector<PaddleTensor> outputs;
-  for (auto& input : inputs_all) {
+  for (auto &input : inputs_all) {
     ASSERT_TRUE(predictor->Run(input, &outputs));
     predictor->ClearIntermediateTensor();
   }
@@ -49,3 +49,27 @@ TEST(AnalysisPredictor, use_gpu) {
 }  // namespace inference
 }  // namespace paddle
+
+namespace paddle_infer {
+TEST(PredictorPool, use_gpu) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  Config config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(model_dir);
+  config.EnableTensorRtEngine();
+  services::PredictorPool pred_pool(config, 1);
+
+  auto predictor = pred_pool.Retrive(0);
+  auto input_names = predictor->GetInputNames();
+  auto input_t =
+      predictor->GetInputHandle(input_names[0]);
+  std::vector<int> in_shape = {1, 3, 224, 224};
+  int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
+                               [](int &a, int &b) { return a * b; });
+
+  std::vector<float> input(in_num, 0);
+  input_t->Reshape(in_shape);
+  input_t->CopyFromCpu(input.data());
+  predictor->Run();
+}
+
+}  // namespace paddle_infer
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 63b3b0f1a3408154a2d1c8aff76a85a95ad044f6..81bb6881fae69b7af494449164f4fed35ade24da 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include
 #include
 
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/port.h"
@@ -1231,3 +1232,24 @@ REGISTER_OP_CPU_KERNEL(
     ops::ActivationGradKernel>);
 /* ========================================================================== */
+
+/* ========================== register checkpoint ===========================*/
+REGISTER_OP_VERSION(leaky_relu)
+    .AddCheckpoint(
+        R"ROC(fix leaky_relu, behavior changed when alpha < 0 or alpha > 1)ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .BugfixWithBehaviorChanged(
+                "leaky_relu calculation formula before checkpoint: out = max(x, "
+                "alpha * x); after checkpoint: out = x if x > 0 else alpha * "
+                "x"));
+
+REGISTER_OP_VERSION(hard_shrink)
+    .AddCheckpoint(
+        R"ROC(fix hard_shrink, behavior changed when threshold < 0)ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .BugfixWithBehaviorChanged(
+                "hard_shrink calculation formula before checkpoint: out = x * "
+                "((x < -threshold) + (x > threshold)); after checkpoint: out = "
+                "x * (((x < -threshold) + (x > threshold)) > 0)"));
+
+/* ========================================================================== */
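To make the leaky_relu change concrete: the two formulas agree for 0 <= alpha <= 1 but diverge outside that range. A small illustrative sketch, not part of the patch:

    #include <algorithm>
    #include <cassert>

    float leaky_relu_old(float x, float alpha) { return std::max(x, alpha * x); }
    float leaky_relu_new(float x, float alpha) { return x > 0 ? x : alpha * x; }

    int main() {
      // With alpha = 2 and x = -1 the results diverge:
      // old: max(-1, -2) = -1, new: alpha * x = -2.
      assert(leaky_relu_old(-1.0f, 2.0f) == -1.0f);
      assert(leaky_relu_new(-1.0f, 2.0f) == -2.0f);
      return 0;
    }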
+ )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "output_padding", + "In order to add additional size to one side of each dimension " + "in the output", + {})); diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index a033611f478f9ea44fd49ab2015e78aaea6aacd9..e584e025088151cb9a6a64045387548d30a9eebf 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -56,7 +56,7 @@ endif() cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op) + DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op scale_op) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc index edbe945cd72bda15b506305dbfe80a3dbe085908..0983b4a406e042f094965ad9a7de437684940fa9 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc @@ -132,6 +132,15 @@ void ProcGetResponse(const VarHandle& var_h, &trainer_id); } +void ProcGetRecvResponse(const VarHandle& var_h, + const ::grpc::ByteBuffer& ret_msg) { + VLOG(4) << "ProcGetRecvResponse"; + framework::Variable* outvar = nullptr; + int trainer_id; + DeserializeRecvFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, + &trainer_id); +} + template void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { ::grpc::Slice slice(proto.ByteSizeLong()); @@ -482,6 +491,79 @@ VarHandlePtr GRPCClient::AsyncDistributeNotify( return h; } +VarHandlePtr GRPCClient::AsyncSendAndRecv(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& send_var_name, + const std::string& recv_var_name, + const std::string& table_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string send_var_name_val = send_var_name; + const std::string recv_var_name_val = recv_var_name; + const std::string table_name_val = table_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + const std::string method = kSendAndRecvRPC; + VLOG(4) << "GRPCClient::SendAndRecv Begin ,Send_var_name: " + << send_var_name_val << " Recv_var_name: " << recv_var_name_val; + int retry_times_ = 0; + + while (true) { + SendAndRecvProcessor* s = new SendAndRecvProcessor(ch); + VarHandlePtr h( + new VarHandle(ep, method, send_var_name_val, p_ctx, p_scope)); + VarHandlePtr h_recv( + new VarHandle(ep, method, recv_var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); + s->RecvPrepare(h_recv); + + framework::AsyncIO([send_var_name_val, recv_var_name_val, table_name_val, + p_scope, p_ctx, s, method, h, this] { + auto* send_var = p_scope->FindVar(send_var_name_val); + send_var->GetMutable()->set_lod({}); + ::grpc::ByteBuffer buf; + VLOG(4) << "SerializeToByteBuffer: send_var_name_val: " + << send_var_name_val + << " recv_var_name_val: " << recv_var_name_val; + SerializeToByteBuffer(send_var_name_val, send_var, *p_ctx, &buf, + recv_var_name_val, trainer_id_, table_name_val); + + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + + // stub context + s->response_call_back_ 
+      s->response_call_back_ = ProcGetRecvResponse;
+
+      platform::RecordRPCEvent record_event(method);
+
+      auto call = s->stub_g_.PrepareUnaryCall(
+          s->context_.get(), "/sendrecv.SendRecvService/SendAndRecvVariable",
+          buf, &cq_);
+      call->StartCall();
+      call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+
+      if (UNLIKELY(platform::IsProfileEnabled())) {
+        h->Wait();
+      }
+    });
+    req_count_++;
+
+    if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) {
+      h->Wait();
+      if (h->should_retry) {
+        VLOG(3) << "rpc call failed, retry times " << retry_times_;
+        retry_times_++;
+        std::random_device rd;
+        std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
+        continue;
+      }
+    }
+
+    return h;
+  }
+}
+
 bool GRPCClient::Wait() {
   std::unique_lock<std::mutex> lk(sync_mutex_);
   sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); });
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h
index bd9f25567dc07381ac8f9010b8a41bbe49c50017..6b6249540c6d15954743c414a60472bf1f831151 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h
@@ -53,6 +53,8 @@ namespace distributed {
 
 void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
 
+void ProcGetRecvResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
+
 class BaseProcessor {
  public:
   BaseProcessor() { context_ = nullptr; }
@@ -131,6 +133,28 @@ class GetProcessor : public BaseProcessor {
   RequestGetCallBack response_call_back_ = ProcGetResponse;
 };
 
+class SendAndRecvProcessor : public BaseProcessor {
+ public:
+  explicit SendAndRecvProcessor(std::shared_ptr<grpc::Channel> ch)
+      : BaseProcessor(), stub_g_(ch) {}
+
+  virtual ~SendAndRecvProcessor() {}
+
+  void ProcessImpl() override {
+    if (response_call_back_) {
+      response_call_back_(*var_h_recv_.get(), reply_);
+      var_h_recv_->Finish(true);
+    }
+  }
+
+  void RecvPrepare(VarHandlePtr h_recv) { var_h_recv_ = h_recv; }
+
+  ::grpc::ByteBuffer reply_;
+  ::grpc::GenericStub stub_g_;
+  RequestGetCallBack response_call_back_ = ProcGetResponse;
+  VarHandlePtr var_h_recv_;
+};
+
 class BatchBarrierProcessor : public BaseProcessor {
  public:
   explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
@@ -231,6 +255,14 @@ class GRPCClient : public RPCClient {
                            const framework::Scope& scope,
                            const std::string& var_name,
                            int64_t time_out = FLAGS_rpc_deadline) override;
 
+  VarHandlePtr AsyncSendAndRecv(const std::string& ep,
+                                const platform::DeviceContext& ctx,
+                                const framework::Scope& scope,
+                                const std::string& send_var_name,
+                                const std::string& recv_var_name,
+                                const std::string& table_name = "",
+                                int64_t time_out = FLAGS_rpc_deadline) override;
+
   VarHandlePtr AsyncSendComplete(
       const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
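For orientation, a caller drives this new RPC roughly as follows. This is a condensed sketch of the rpc_server_test added later in this patch; the endpoint and variable names are illustrative, and a prepared DeviceContext ctx and Scope scope are assumed:

    // Send local variable "x" to the server, run the registered block there,
    // and receive the result back into local variable "res".
    auto* client =
        distributed::RPCClient::GetInstance<GRPCClient>(/*trainer_id=*/0);
    distributed::VarHandlePtr handle =
        client->AsyncSendAndRecv("127.0.0.1:6174", ctx, scope, "x", "res");
    handle->Wait();  // blocks until the response is deserialized into "res"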
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
index bb9719eaad0447cbc298fbd7ed9ec635ae6df58d..eddd89cf20c2eb91e88d666a6ffe4a045df7298b 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
@@ -76,7 +76,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
     PADDLE_THROW("Serialize does not support type: %s",
                  typeid(var->Type()).name());
   }
-
   std::string header;
   request.AppendToString(&header);
   auto buffer = std::unique_ptr<char[]>(new char[1024]);
@@ -101,7 +100,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   }
 #endif
   PADDLE_ENFORCE_NOT_NULL(payload);
-
   e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
                             payload->memory_size());
   if (payload->memory_size() >= std::numeric_limits<int>::max()) {
@@ -140,7 +138,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                              ::grpc::Slice::STEAL_REF);
     num_slices = 4;
   }
-
   ::grpc::ByteBuffer tmp(&slices[0], num_slices);
   msg->Swap(&tmp);
 }
@@ -156,6 +153,19 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
   *trainer_id = resp.GetTrainerId();
 }
 
+void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
+                                   const platform::DeviceContext& ctx,
+                                   const framework::Scope* scope,
+                                   framework::Variable** var, int* trainer_id) {
+  platform::RecordRPCEvent record_event("deserial");
+  operators::distributed::GRPCVariableResponse resp(scope, &ctx);
+  PADDLE_ENFORCE_EQ(
+      resp.Parse(msg), 0,
+      platform::errors::InvalidArgument("parse bytebuffer to tensor error!"));
+  *var = resp.GetRecvVar();
+  *trainer_id = resp.GetTrainerId();
+}
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h
index c9a57beb3a6a7a7cc9973ff0e5325a3daa6d98a9..30e6907656e25bc7bcae77d3bd02638f6bb7601d 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.h
@@ -47,6 +47,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const framework::Scope* scope,
                                framework::Variable** var, int* trainer_id);
 
+void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
+                                   const platform::DeviceContext& ctx,
+                                   const framework::Scope* scope,
+                                   framework::Variable** var, int* trainer_id);
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
index e7effcc1805f83eb16f07ceb7db53ce08983ad60..5c0232a50a9066f782be5269b4041958748c2e23 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
@@ -28,6 +28,7 @@ DECLARE_int32(rpc_retry_bind_port);
 namespace paddle {
 namespace operators {
 namespace distributed {
+
 enum CallStatus { PROCESS = 0, FINISH };
 
 // reference:
@@ -433,6 +434,51 @@ class RequestNotify final : public RequestBase {
   ServerAsyncResponseWriter responder_;
 };
 
+class RequestSendAndRecv final : public RequestBase {
+ public:
+  explicit RequestSendAndRecv(GrpcService::AsyncService* service,
+                              ::grpc::ServerCompletionQueue* cq,
+                              RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
+    request_.reset(new GRPCVariableResponse(
+        request_handler->scope(), request_handler->dev_ctx(),
+        request_handler->distributed_mode()));
+
+    int method_id =
+        static_cast<int>(distributed::GrpcMethod::kRequestSendAndRecv);
+
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+
+  virtual ~RequestSendAndRecv() {}
+  std::string GetReqName() override { return request_->Varname(); }
+
+  void Process() override {
+    std::string in_var_name = request_->Varname();
+    std::string out_var_name = request_->OutVarname();
+    std::string table_name = request_->TableName();
+    int trainer_id = request_->GetTrainerId();
+
+    VLOG(4) << "RequestSendAndRecv, in_var_name: " << in_var_name
+            << " out_var_name: " << out_var_name
trainer: " << trainer_id; + auto scope = request_->GetMutableLocalScope(); + auto invar = scope->FindVar(in_var_name); + framework::Variable* outvar = nullptr; + request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, + out_var_name, table_name); + SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), + &reply_); + Finish(reply_, &responder_); + } + + protected: + std::shared_ptr request_; + ::grpc::ByteBuffer reply_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; +}; + void AsyncGRPCServer::WaitServerReady() { VLOG(4) << "AsyncGRPCServer is waiting server ready"; std::unique_lock lock(this->mutex_ready_); @@ -586,6 +632,8 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id); } else if (rpc_name == kRequestNotify) { b = new RequestNotify(service_.get(), cq.get(), handler, req_id); + } else if (rpc_name == kRequestSendAndRecv) { + b = new RequestSendAndRecv(service_.get(), cq.get(), handler, req_id); } else { PADDLE_ENFORCE(false, "not supported rpc"); } diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h index 45152293896e86806fe87324416c2588796558ba..95b6810ec61977b70617c9f20c2e75775157a6fb 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_service.h @@ -85,10 +85,12 @@ enum class GrpcMethod { kGetMonomerVariable, kGetMonomerBarrier, kRequestNotify, + kRequestSendAndRecv, + // when you add new handler, change kGrpcNumMethods at the same time! }; static const int kGrpcNumMethods = - static_cast(GrpcMethod::kRequestNotify) + 1; + static_cast(GrpcMethod::kRequestSendAndRecv) + 1; inline const char* GrpcMethodName(GrpcMethod id) { switch (id) { @@ -108,6 +110,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/CheckpointNotify"; case GrpcMethod::kRequestNotify: return "/sendrecv.SendRecvService/DistributeNotify"; + case GrpcMethod::kRequestSendAndRecv: + return "/sendrecv.SendRecvService/SendAndRecvVariable"; } // Shouldn't be reached. 
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 59531c0ec78ed8f0ec60a94d48069685e5b8c1a2..44359af1b1b2a6a161adcc83b97ea5fad96eecb0 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -46,6 +46,7 @@ constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
 constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
 constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier";
 constexpr char kRequestNotify[] = "RequestNotify";
+constexpr char kRequestSendAndRecv[] = "RequestSendAndRecv";
 
 constexpr char kSendRPC[] = "SendRPC";
 constexpr char kGetRPC[] = "GetRPC";
@@ -57,6 +58,7 @@ constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC";
 constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC";
 constexpr char kSendCompleteRPC[] = "SendCompleteRPC";
 constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC";
+constexpr char kSendAndRecvRPC[] = "SendAndRecvRPC";
 constexpr int64_t kPrefetchTimeout = 60000;
 
 #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index e99b0ed4072645fcbc3ef4ce8728fc0f9cd912a3..761a4edc523da52ffdbdd2039444c133e8da368c 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -325,6 +325,22 @@ bool RequestNotifyHandler::Handle(const std::string &varname,
   return true;
 }
 
+bool RequestSendAndRecvHandler::Handle(const std::string &varname,
+                                       framework::Scope *Scope,
+                                       framework::Variable *var,
+                                       framework::Variable **outvar,
+                                       const int trainer_id,
+                                       const std::string &out_var_name,
+                                       const std::string &table_name) {
+  VLOG(3) << "SendAndRecvHandle: " << varname
+          << " out_var_name: " << out_var_name
+          << " , trainer_id: " << trainer_id;
+
+  executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), Scope);
+  *outvar = Scope->FindVar(out_var_name);
+  return true;
+}
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h
index f22a133c2d5b1196a672f978d76d1c362f616bf6..42621724e68f40617bebd2b01e2af5dd23387163 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.h
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h
@@ -176,6 +176,17 @@ class RequestNotifyHandler final : public RequestHandler {
   std::unordered_map<std::string, int64_t> decay_counters;
 };
 
+class RequestSendAndRecvHandler final : public RequestHandler {
+ public:
+  explicit RequestSendAndRecvHandler(int distributed_mode)
+      : RequestHandler(distributed_mode) {}
+  virtual ~RequestSendAndRecvHandler() {}
+  bool Handle(const std::string& varname, framework::Scope* Scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override;
+};
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
index 62313222775c662b78bfab5827cd5b418a2a0997..69a5e3274318337f5424afa6492da829e04daa69 100644
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -85,6 +85,12 @@ class RPCClient {
framework::Scope& scope, const std::string& var_name, int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncSendAndRecv( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& send_var_name, + const std::string& recv_var_name, const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncSendComplete( const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc index 67e11120b808e26df590440389c71f3340738082..5ce7ac85269572ea7d2b6a015bb6c9d106f8199e 100644 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -35,27 +35,24 @@ namespace platform = paddle::platform; namespace distributed = paddle::operators::distributed; USE_NO_KERNEL_OP(lookup_sparse_table_read); +USE_OP(scale); std::unique_ptr g_rpc_service; std::unique_ptr g_req_handler; -framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { +framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { auto root_block = program->MutableBlock(0); auto* block = program->AppendBlock(*root_block); - framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}}); - framework::VariableNameMap output({{"Output", {"out"}}}); - auto op = block->AppendOp(); - op->SetType("lookup_sparse_table_read"); - op->SetInput("W", {"w"}); - op->SetInput("Ids", {"ids"}); - op->SetOutput("Out", {"out"}); - op->SetAttr("tablename", {"w"}); - op->SetAttr("value_names", {"Param"}); - - auto& out = *root_block->Var("out"); + framework::OpDesc* op = block->AppendOp(); + op->SetType("scale"); + op->SetInput("X", {"x"}); + op->SetOutput("Out", {"res"}); + op->SetAttr("scale", 0.5f); + + auto& out = *root_block->Var("res"); out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetShape({10, 10}); + out.SetShape({1, 10}); return block; } @@ -69,6 +66,12 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { auto ids_var = scope->Var("ids"); ids_var->GetMutable(); + + auto x_var = scope->Var("x"); + x_var->GetMutable(); + + auto res_var = scope->Var("res"); + res_var->GetMutable(); } void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, @@ -78,6 +81,11 @@ void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, int64_t* ids_ptr = ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; + + auto x_var = scope->Var("x")->GetMutable(); + float* x_ptr = + x_var->mutable_data(framework::DDim({1, rows_numel}), *place); + for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0; } void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, @@ -124,6 +132,38 @@ void StartServer(const std::string& rpc_name) { server_thread.join(); } +void StartSendAndRecvServer(const std::string& rpc_name) { + framework::ProgramDesc program; + framework::Scope scope; + platform::CPUPlace place; + framework::Executor exe(place); + platform::CPUDeviceContext ctx(place); + auto block = AppendSendAndRecvBlock(&program); + std::string in_var_name("x"); + std::vector prefetch_block_ids{block->ID()}; + auto prepared = exe.Prepare(program, prefetch_block_ids); + InitTensorsOnServer(&scope, &place, 10); + + std::unordered_map> + grad_to_prepared_ctx; + grad_to_prepared_ctx[in_var_name] = 
+      prepared[0];
+
+  g_req_handler->SetProgram(&program);
+  g_req_handler->SetGradToPreparedCtx(&grad_to_prepared_ctx);
+  g_req_handler->SetDevCtx(&ctx);
+  g_req_handler->SetScope(&scope);
+  g_req_handler->SetExecutor(&exe);
+
+  g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get());
+  g_req_handler->SetRPCServer(g_rpc_service.get());
+
+  std::thread server_thread(
+      std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
+
+  server_thread.join();
+}
+
 TEST(COMPLETE, CPU) {
   setenv("http_proxy", "", 1);
   setenv("https_proxy", "", 1);
@@ -147,3 +187,46 @@ TEST(COMPLETE, CPU) {
   g_rpc_service.reset(nullptr);
   g_req_handler.reset(nullptr);
 }
+
+TEST(SENDANDRECV, CPU) {
+  setenv("http_proxy", "", 1);
+  setenv("https_proxy", "", 1);
+  g_req_handler.reset(new distributed::RequestSendAndRecvHandler(
+      distributed::DistributedMode::kAsync));
+  g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
+  distributed::RPCClient* client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
+  PADDLE_ENFORCE_NE(client, nullptr,
+                    platform::errors::InvalidArgument(
+                        "Client Start Fail, Check Your Code & Env"));
+  std::thread server_thread(StartSendAndRecvServer,
+                            distributed::kRequestSendAndRecv);
+  g_rpc_service->WaitServerReady();
+  int port = g_rpc_service->GetSelectedPort();
+  std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
+
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+
+  // create var on local scope
+  int64_t rows_numel = 10;
+  InitTensorsOnClient(&scope, &place, rows_numel);
+  std::string in_var_name("x");
+  std::string out_var_name("res");
+
+  client->AsyncSendAndRecv(ep, ctx, scope, in_var_name, out_var_name);
+  client->Wait();
+  auto var = scope.Var(out_var_name);
+  auto value = var->GetMutable<framework::LoDTensor>();
+  auto ptr = value->mutable_data<float>(place);
+
+  for (int64_t i = 0; i < rows_numel; ++i) {
+    EXPECT_EQ(ptr[i], 0.5);
+  }
+  g_rpc_service->ShutDown();
+  server_thread.join();
+  LOG(INFO) << "begin reset";
+  g_rpc_service.reset(nullptr);
+  g_req_handler.reset(nullptr);
+}
diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in
index 0337b72181cf9f612fe56ae24bad39775bfcde28..a333642bd16fbfbe648a835101d67218bf473cdb 100644
--- a/paddle/fluid/operators/distributed/send_recv.proto.in
+++ b/paddle/fluid/operators/distributed/send_recv.proto.in
@@ -29,7 +29,7 @@ service SendRecvService {
   rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
 
   rpc DistributeNotify(VariableMessage) returns (VoidMessage) {}
-
+  rpc SendAndRecvVariable(VariableMessage) returns (VariableMessage) {}
   rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {}
   rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
 }
diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h
index 3cabcd22cd52222aff2555a8449e558de2c287c0..d979cd8a881ec7d697eae06b4911d597730b6908 100644
--- a/paddle/fluid/operators/distributed/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
@@ -96,6 +96,13 @@ class VariableResponse {
     return scope_->FindVar(meta_.varname());
   }
 
+  framework::Variable* GetRecvVar() {
+    if (create_scope_) {
+      return local_scope_->Var(meta_.out_varname());
+    }
+    return scope_->FindVar(meta_.out_varname());
+  }
+
   int GetTrainerId() { return static_cast<int>(meta_.trainer_id()); }
 
  protected:
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
index 5869407be5a5750d3948f87fe8743adf0a425422..5e1e408eb2c28239fded0d0cf037c94783828b50 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -268,7 +268,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
                     "server program should have at least 2 blocks");
-
   std::vector<int> block_list;
   for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
     block_list.push_back(blkid);
@@ -295,6 +294,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
   request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
   request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
   request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
+  request_send_and_recv_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
 
   while (true) {
     if (rpc_service_->IsExit()) {
@@ -394,6 +394,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
       new distributed::RequestGetNoBarrierHandler());
   request_notify_handler_.reset(
       new distributed::RequestNotifyHandler(distributed_mode, fan_in));
+  request_send_and_recv_handler_.reset(
+      new distributed::RequestSendAndRecvHandler(distributed_mode));
 
   rpc_service_->RegisterRPC(distributed::kRequestSend,
                             request_send_handler_.get(), rpc_send_thread_num);
@@ -408,6 +410,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
                             request_get_no_barrier_handler_.get());
   rpc_service_->RegisterRPC(distributed::kRequestNotify,
                             request_notify_handler_.get(), rpc_send_thread_num);
+  rpc_service_->RegisterRPC(distributed::kRequestSendAndRecv,
+                            request_send_and_recv_handler_.get(),
+                            rpc_get_thread_num);
 
   auto optimize_blocks =
       Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
@@ -416,6 +421,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
                         "optimize blocks is less than 1. Optimize blocks "
                         "should be 1 at least on the pserver side."));
   auto *program = optimize_blocks[0]->Program();
+  framework::Executor executor(dev_place);
 
   std::shared_ptr<framework::ExecutorPrepareContext> ckpt_pre_context = nullptr;
@@ -488,6 +494,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   f(request_checkpoint_handler_.get());
   f(request_get_no_barrier_handler_.get());
   f(request_notify_handler_.get());
+  f(request_send_and_recv_handler_.get());
 
   // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
   signal(SIGINT, SignalHandler::StopAndExit);
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
index 369743dfb2392c029bc3b671e519aefbbdd2b6b7..b41e4e87722f638e6661a5116ebdfbc02c32710f 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
@@ -99,6 +99,8 @@ class ListenAndServOp : public framework::OperatorBase {
   mutable std::shared_ptr<distributed::RequestHandler> request_checkpoint_handler_;
   mutable std::shared_ptr<distributed::RequestHandler> request_notify_handler_;
+  mutable std::shared_ptr<distributed::RequestSendAndRecvHandler>
+      request_send_and_recv_handler_;
 
   mutable std::shared_ptr<std::thread> server_thread_;
   mutable std::vector<std::string> sparse_vars_;
diff --git a/paddle/fluid/operators/distributed_ops/send_and_recv_op.cc b/paddle/fluid/operators/distributed_ops/send_and_recv_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..00cdbe70ca47e6e0bba8294b3b81c804b096339c
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/send_and_recv_op.cc
@@ -0,0 +1,98 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <future>  // NOLINT
+#include <ostream>
+
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/distributed/communicator.h"
+#include "paddle/fluid/operators/distributed/communicator_common.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
+#include "paddle/fluid/operators/distributed/parameter_send.h"
+#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SendAndRecvKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& scope = ctx.scope();
+    const auto& place = ctx.GetPlace();
+    auto send_var_name = ctx.Attr<std::string>("send_var_name");
+    auto recv_var_name = ctx.Attr<std::string>("recv_var_name");
+    auto epmap = ctx.Attr<std::string>("endpoint");
+    auto trainer_id = ctx.Attr<int>("trainer_id");
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& context = *pool.Get(place);
+
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
+    VLOG(3) << "SendAndRecvOp Send_var_name: " << send_var_name
+            << " Recv_var_name: " << recv_var_name;
+    distributed::VarHandlePtr rets = rpc_client->AsyncSendAndRecv(
+        epmap, context, scope, send_var_name, recv_var_name);
+    rets->Wait();
+  }
+};
+
+class SendAndRecvOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+    return framework::OpKernelType(data_type, platform::CPUPlace());
+  }
+};
+
+class SendAndRecvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "Tensor Input variable to be sent").AsDuplicable();
+    AddOutput("Out", "Tensor Output variable to be received").AsDuplicable();
+    AddAttr<std::string>("send_var_name", "Send Tensor's name")
+        .SetDefault(std::string(""));
+    AddAttr<std::string>("recv_var_name", "Recv Tensor's name")
+        .SetDefault(std::string(""));
+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
+    AddAttr<std::string>("endpoint", "Server endpoint")
+        .SetDefault({"127.0.0.1:6164"});
+    AddComment(R"DOC(
+    SendAndRecv operator
+    This operator sends a variable to the listen_and_serv op at the parameter
+    server, and receives the result variable back into the sending scope.
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(send_and_recv, ops::SendAndRecvOp, ops::SendAndRecvOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    send_and_recv,
+    ops::SendAndRecvKernel<paddle::platform::CPUDeviceContext, float>)
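A rough sketch of wiring this operator into a program by hand, mirroring the AppendSendAndRecvBlock helper from the rpc_server_test above; in practice the transpiler layer emits it, and all attribute values here are illustrative:

    // Assumes an existing framework::BlockDesc* block.
    framework::OpDesc* op = block->AppendOp();
    op->SetType("send_and_recv");
    op->SetInput("X", {"x"});
    op->SetOutput("Out", {"res"});
    op->SetAttr("send_var_name", std::string("x"));
    op->SetAttr("recv_var_name", std::string("res"));
    op->SetAttr("endpoint", std::string("127.0.0.1:6164"));
    op->SetAttr("trainer_id", 0);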
null.")); auto param_dim = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ( param_dim, ctx->GetInputDim("Grad"), "param and grad input of AdadeltaOp should have same dimension"); - PADDLE_ENFORCE_NE(framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0, - "Maybe the Input variable AvgSquaredGrad has not " - "been initialized. You may need to confirm if you put " - "exe.run(startup_program) after optimizer.minimize " - "function."); + PADDLE_ENFORCE_NE( + framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0, + platform::errors::InvalidArgument( + "Maybe the Input variable AvgSquaredGrad has not " + "been initialized. You may need to confirm if you put " + "exe.run(startup_program) after optimizer.minimize " + "function.")); PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"), - "Param and AvgSquaredGrad input of AdadeltaOp " - "should have same dimension"); + platform::errors::InvalidArgument( + "Param and AvgSquaredGrad input of AdadeltaOp " + "should have same dimension")); PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"), - "Param and AvgSquaredUpdate input of AdadeltaOp " - "should have same dimension"); + platform::errors::InvalidArgument( + "Param and AvgSquaredUpdate input of AdadeltaOp " + "should have same dimension")); ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("AvgSquaredGradOut", param_dim); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h index e66dec7cf0ff686f91103e438b6374fce29af774..85cfad35858bbe6b112169f196c0711d981e9446 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ b/paddle/fluid/operators/optimizers/adadelta_op.h @@ -24,17 +24,19 @@ class AdadeltaOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE(param_var->IsType(), - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type())); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE(grad_var->IsType(), - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type())); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(grad_var->Type()))); auto param_out_tensor = ctx.Output("ParamOut"); auto avg_squared_grad_out_tensor = diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index 1fbf6d00ef763f4cb608be6d62cf4bff54f620ec..d3f9754d307c6040a66a3452d7bb008159ff46e5 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -23,22 +23,27 @@ class TopkOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of TopkOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of TopkOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Indices"), - 
"Output(Indices) of TopkOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + platform::errors::InvalidArgument( + "Input(X) of TopkOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + platform::errors::InvalidArgument( + "Output(Out) of TopkOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Indices"), true, + platform::errors::InvalidArgument( + "Output(Indices) of TopkOp should not be null.")); auto input_dims = ctx->GetInputDim("X"); const int k = static_cast(ctx->Attrs().Get("k")); PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); - PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape"); + PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( + "input must have >= 1d shape")); if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k, - "input must have >= k columns"); + PADDLE_ENFORCE_GE( + input_dims[input_dims.size() - 1], k, + platform::errors::InvalidArgument("input must have >= k columns")); } framework::DDim dims = input_dims; diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index d8b2e92616091a8c822c6fd0bfdfb1148c25534d..0a694e1ad5b012d70a89ddcca2d70fbe8c9e24ba 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -43,8 +43,9 @@ template class TopkOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::InvalidArgument("It must use CUDAPlace.")); auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); auto* indices = ctx.Output("Indices"); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index cf0dac022f74e47261fc28d02665bcde49dc8b39..040dd313f1c538b5792538f9da04635ff805b9a8 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -206,9 +206,9 @@ void BindInferenceApi(py::module *m) { BindMkldnnQuantizerConfig(m); #endif m->def("create_paddle_predictor", - &paddle::CreatePaddlePredictor); + &paddle::CreatePaddlePredictor, py::arg("config")); m->def("create_paddle_predictor", - &paddle::CreatePaddlePredictor); + &paddle::CreatePaddlePredictor, py::arg("config")); m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes); } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a77d605eb6c26b02c38a58195d1f8f1e84a3dc20..926747ef6186e3b9439baf787572fe9d1988fb46 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1399,6 +1399,9 @@ function main() { local CMD=$1 local parallel_number=$2 init + if [ "$CMD" != "assert_file_approvals" ];then + python ${PADDLE_ROOT}/tools/summary_env.py + fi case $CMD in build_only) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index d66577102c713ae796abdf812f1cf2bb767e5b14..b7357eef7ad9a3abae7f9c1c09fdc00b409061ad 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -30,8 +30,11 @@ __all__ = ["spawn"] # dygraph parallel apis __all__ += [ - "init_parallel_env", "get_rank", "get_world_size", "prepare_context", - "ParallelEnv" + "init_parallel_env", + "get_rank", + "get_world_size", + 
"prepare_context", + "ParallelEnv", ] # collective apis diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index b080fb17553d4a93a545f4ae781d786d82e26576..42ac68ba1a64de54f029878ceab08435c924d087 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -18,16 +18,15 @@ from .base.distributed_strategy import DistributedStrategy from .base.fleet_base import Fleet from .base.util_factory import UtilBase from .dataset import * +#from . import metrics __all__ = [ "DistributedStrategy", "UtilBase", "DatasetFactory", - "DatasetBase", - "InMemoryDataset", - "QueueDataset", "UserDefinedRoleMaker", "PaddleCloudRoleMaker", + "Fleet", ] fleet = Fleet() diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 5e527ea03ab9c816948f343ac103672a751fdbc3..26063d1b8a9225aff63628bb37f433ec95257dc7 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -17,6 +17,8 @@ from paddle.distributed.fleet.proto import distributed_strategy_pb2 from paddle.fluid.framework import Variable, set_flags, core import google.protobuf.text_format +__all__ = ["DistributedStrategy"] + def get_msg_dict(msg): res_dict = {} diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index a6286bcca87fad1afddbd8af1e56dad05dab2578..f4a16d0de177f8a63271ef43f6716aa31443f06f 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -22,7 +22,7 @@ from .runtime_factory import RuntimeFactory from .util_factory import UtilFactory from paddle.fluid.wrapped_decorator import wrap_decorator -__all__ = ['Fleet'] +#__all__ = ['Fleet'] def _inited_runtime_handler_(func): @@ -85,7 +85,7 @@ class Fleet(object): This function is responsible for the distributed architecture what you want to run your code behind,such as Transpiler, Collective in PaddleCloudRoleMaker or UserDefinedRoleMaker - + """ if isinstance(role_maker, RoleMakerBase): self._role_maker = role_maker @@ -112,7 +112,7 @@ class Fleet(object): Returns: bool: True if this is the first node of worker, False if not. - + """ return self._role_maker.is_first_worker() @@ -200,7 +200,8 @@ class Fleet(object): bool: True if this is a node of server, False if not. 
""" - return self._role_maker.is_server() + return self._role_maker.is_server( + ) or self._role_maker._is_heter_worker() @property def util(self): @@ -372,10 +373,10 @@ class Fleet(object): can_not_apply_optimizer_list.append(opt) # combine recalled meta optimizers to be a valid meta optimizer meta_optimizer, graph_optimizer = \ - self.strategy_compiler.generate_optimizer( - loss, self._role_maker, self.user_defined_optimizer, - self.user_defined_strategy, valid_optimizer_list, - valid_graph_optimizer_list) + self.strategy_compiler.generate_optimizer( + loss, self._role_maker, self.user_defined_optimizer, + self.user_defined_strategy, valid_optimizer_list, + valid_graph_optimizer_list) valid_strategy = self.strategy_compiler._get_valid_strategy( self.user_defined_strategy, can_not_apply_optimizer_list) diff --git a/python/paddle/distributed/fleet/base/meta_optimizer_factory.py b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py index 459070fcc4dbef3711c33b2932e8f1c88647aab5..f845b3fcd8953c44c8b5b857dac08be1c7269958 100755 --- a/python/paddle/distributed/fleet/base/meta_optimizer_factory.py +++ b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -__all__ = ["MetaOptimizerFactory"] - from ..meta_optimizers import * meta_optimizer_names = list( diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 6aeeb4a2896ea1d20390e463937aa07d3edd0204..25f2d0dd3f45855d9f337c6b7154db9cb5bbae45 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -14,15 +14,17 @@ """Defination of Role Makers.""" import os import numpy as np +import warnings from multiprocessing import Process, Manager import paddle.fluid as fluid -__all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker'] +#__all__ = ['UserDefinedRoleMaker', 'PaddleCloudRoleMaker'] class Role: WORKER = 1 SERVER = 2 + HETER_WORKER = 3 class RoleMakerBase(object): @@ -40,6 +42,11 @@ class RoleMakerBase(object): self._role = None self._current_id = -1 + # for heter parameter server mode + self._heter_trainer_endpoints = [] + self._heter_trainer_device = "CPU" + self._is_heter_parameter_server_mode = False + self._node_type = None self._node_type_comm = None self._all_comm = None @@ -163,12 +170,58 @@ class RoleMakerBase(object): """ print("warning: RoleMakerBase does not have barrier worker.") + def _is_heter_worker(self): + """ + Return is_heter_worker() of current process + """ + warnings.warn("RoleMakerBase does not have function: _is_heter_worker.") + return False + + def _heter_worker_num(self): + """ + Get current total heter-worker number. 
+
+        Returns:
+            int: heter_worker number
+        """
+        warnings.warn(
+            "RoleMakerBase does not have function: _heter_worker_num.")
+        return 0
+
+    def _get_heter_worker_endpoints(self):
+        """
+        Returns:
+            string: all heter_trainers' endpoints
+        """
+        assert self._heter_trainer_endpoints != []
+        return self._heter_trainer_endpoints
+
+    def _get_heter_worker_endpoint(self):
+        """
+        Returns:
+            string: the corresponding heter_trainer's endpoint
+
+        e.g. if we have 4 cpu-trainers (default) and 2 gpu-trainers (heter),
+        then No.0 and No.2 cpu-trainers will work with the No.0 gpu-trainer,
+        and No.1 and No.3 cpu-trainers will work with the No.1 gpu-trainer
+        """
+        assert self._heter_trainer_endpoints != []
+        return self._heter_trainer_endpoints[(self._current_id + 1) %
+                                             self._heter_worker_num()]
+
+    def _get_heter_worker_device(self):
+        """
+        Returns:
+            string: heter_trainer's device of current node, e.g. CPU/GPU/XPU
+        """
+        return self._heter_trainer_device.upper()
+
 
 class PaddleCloudRoleMaker(RoleMakerBase):
     def __init__(self, is_collective=False, **kwargs):
         super(PaddleCloudRoleMaker, self).__init__()
         self._is_collective = is_collective
-        self._init_gloo = False  #default no init gloo
+        self._init_gloo = False  # default no init gloo
         self._kwargs = kwargs
         self._role_is_generated = False
@@ -278,10 +331,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         """
        get index of current node
         """
-        if self.is_server():
-            return self.server_index()
-        elif self.is_worker():
-            return self.worker_index()
+        return self._current_id
 
     def worker_num(self):
         """
@@ -323,6 +373,22 @@ class PaddleCloudRoleMaker(RoleMakerBase):
             self.generate_role()
         return self._server_endpoints
 
+    def _heter_worker_num(self):
+        """
+        get heter worker num
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._heter_trainers_num
+
+    def _is_heter_worker(self):
+        """
+        whether current process is a heter worker
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._role == Role.HETER_WORKER
+
     def _get_rank(self):
         """
         get current rank in all workers and pservers
@@ -342,17 +408,47 @@ def _ps_env(self):
         try:
             # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
-            # format: string(ip:port), eg. 127.0.0.1:6001
-            self._server_endpoints = os.environ[
-                "PADDLE_PSERVERS_IP_PORT_LIST"].split(",")
+            # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002
+            self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST",
+                                               "").split(",")
+            assert self._server_endpoints != ""
             self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
                                                "").split(",")
+            assert self._worker_endpoints != ""
             trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
             training_role = os.environ["TRAINING_ROLE"]
 
-            if training_role not in ["TRAINER", "PSERVER"]:
-                raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
+            if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]:
+                raise ValueError(
+                    "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but got {}, please check your environment.".
+ format(training_role)) + + # For heter parameter server env setting + heter_trainer_eplist = os.getenv( + "PADDLE_HETER_TRAINER_IP_PORT_LIST", None) + heter_trainer_device = os.getenv("PADDLE_HETER_TRAINER_DEVICE", + None) + if heter_trainer_eplist and heter_trainer_device: + try: + heter_trainer_eplist = os.environ[ + "PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",") + except: + raise ValueError( + "Can not Find PADDLE_HETER_TRAINER_IP_PORT_LIST in env or its format doesn't match the requirement: 'IP:PORT,IP:PORT' ." + ) + + self._is_heter_parameter_server_mode = True + heter_trainers_num = len(heter_trainer_eplist) + current_node_device = heter_trainer_device.upper() + if current_node_device not in ["CPU", "GPU", "XPU"]: + raise ValueError( + "Heter Trainer doesn't support {} device now, please use CPU / GPU / XPU(KunLun)". + format(heter_trainer_device)) + self._heter_trainer_device = current_node_device + else: + self._is_heter_parameter_server_mode = False + heter_trainers_num = 0 if training_role == "TRAINER": role = Role.WORKER @@ -365,17 +461,26 @@ class PaddleCloudRoleMaker(RoleMakerBase): ip = os.environ["POD_IP"] self._cur_endpoint = ip + ":" + port current_id = self._server_endpoints.index(self._cur_endpoint) + elif training_role == "HETER_TRAINER": + role = Role.HETER_WORKER + cur_ip = os.environ["POD_IP"] + cur_port = os.environ["PADDLE_PORT"] + curr_endpoint = ":".join([cur_ip, cur_port]) + current_id = heter_trainer_eplist.index(curr_endpoint) else: - raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER") - except ValueError as ve: + raise ValueError( + "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER") + except ValueError as e: raise ValueError( - "something wrong with PaddleCloud, please check environment") + "Something wrong with PaddleCloud, please check environment") self._trainers_num = trainers_num self._role = role self._current_id = current_id self._node_num = len( set([x.split(':')[0] for x in self._worker_endpoints])) + self._heter_trainers_num = heter_trainers_num + self._heter_trainer_endpoints = heter_trainer_eplist def _collective_env(self): self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py index 075e8b6c4302d792606849fc2981e46ccead1e56..d98b2ef3e2a083861647b2847bafad3b08c86cfd 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py @@ -15,24 +15,10 @@ from .amp_optimizer import AMPOptimizer from .recompute_optimizer import RecomputeOptimizer from .gradient_merge_optimizer import GradientMergeOptimizer from .graph_execution_optimizer import GraphExecutionOptimizer -from .async_optimizer import AsyncMetaOptimizer +from .parameter_server_optimizer import ParameterServerOptimizer from .pipeline_optimizer import PipelineOptimizer from .localsgd_optimizer import LocalSGDOptimizer from .lars_optimizer import LarsOptimizer -from .async_graph_execution_optimizer import AsyncGraphExecutionOptimizer +from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer from .dgc_optimizer import DGCOptimizer from .lamb_optimizer import LambOptimizer - -__all__ = [ - 'AMPOptimizer', - 'RecomputeOptimizer', - 'GradientMergeOptimizer', - 'AsyncMetaOptimizer', - 'GraphExecutionOptimizer', - 'PipelineOptimizer', - 'LocalSGDOptimizer', - 'LarsOptimizer', - 'AsyncGraphExecutionOptimizer', - 'DGCOptimizer', - 'LambOptimizer', -] 
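A side note on why deleting these `__all__` lists is safe for the factory below, which discovers optimizers via `from ..meta_optimizers import *`: when a module defines no `__all__`, a star-import binds every top-level name that does not start with an underscore, so the optimizer classes stay visible. A minimal, self-contained sketch of that rule (the module here is fabricated purely for illustration and is not part of Paddle):

    import sys
    import types

    # Build a throwaway module holding one public and one private name.
    mod = types.ModuleType("fake_meta_optimizers")
    exec("class AMPOptimizer(object): pass\n_helper = object()", mod.__dict__)
    sys.modules["fake_meta_optimizers"] = mod

    ns = {}
    exec("from fake_meta_optimizers import *", ns)
    assert "AMPOptimizer" in ns  # public names survive the star-import
    assert "_helper" not in ns   # underscore-prefixed names are filtered out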
diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py index 66db14209b4c57475c30c6dde083593e27f04ea0..b1952276e44cd1466bc443440505462924115ab7 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py @@ -14,8 +14,6 @@ import paddle.fluid.contrib.mixed_precision as mixed_precision from .meta_optimizer_base import MetaOptimizerBase -__all__ = ["AMPOptimizer"] - class AMPOptimizer(MetaOptimizerBase): def __init__(self, optimizer): diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index f34786f9dc309dd1f03319368bbc93ef1bfc03e3..f1c6defc5c982c7d56980642898aaa333c199bbe 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -15,8 +15,6 @@ from paddle.fluid.optimizer import Momentum, DGCMomentumOptimizer from .meta_optimizer_base import MetaOptimizerBase import logging -__all__ = ["DGCOptimizer"] - class DGCOptimizer(MetaOptimizerBase): def __init__(self, optimizer): diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py index bd52179a35862241768ad5bd01eedf16732ad3b6..7db79ad7b5b7081172209faa2396d9f2a31bbdb3 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py @@ -14,10 +14,6 @@ from paddle.fluid.optimizer import GradientMergeOptimizer as GM from .meta_optimizer_base import MetaOptimizerBase -__all__ = ["GradientMergeOptimizer"] - -# amp + gradient merge + lamb - class GradientMergeOptimizer(MetaOptimizerBase): def __init__(self, optimizer): diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py index 7e08a02eb1dc2e14b1871fe7743bbee8ade3feb3..9fa29c4078e9f579a740ef8c0591979e7fbb962d 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py @@ -16,8 +16,6 @@ from paddle.fluid.optimizer import LambOptimizer as LAMB from .meta_optimizer_base import MetaOptimizerBase import logging -__all__ = ["LambOptimizer"] - class LambOptimizer(MetaOptimizerBase): def __init__(self, optimizer): diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py index 09c418fa79106d05cffae1e8bc18fac9c0cc8f34..a7b856ff5b0dcb1ab30de82a12c91a2e1c14fe76 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py @@ -15,8 +15,6 @@ from paddle.fluid.optimizer import Momentum, LarsMomentumOptimizer from .meta_optimizer_base import MetaOptimizerBase import logging -__all__ = ["LarsOptimizer"] - class LarsOptimizer(MetaOptimizerBase): def __init__(self, optimizer): diff --git a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py index 12a4d904340337bf9a99968c7d82db117bf59ce8..073148e11a0a2b08253b89d36d7a014b830518f8 100644 --- 
a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py +++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -__all__ = ["MetaOptimizerBase"] - from paddle.fluid.optimizer import Optimizer diff --git a/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py similarity index 88% rename from python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py rename to python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py index c0dee220aafd07bf69a198c6b03e6c957c50d4ce..878ed7422d733d3e2828e0395ec63ed16b4c489a 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py @@ -13,12 +13,12 @@ from paddle import fluid from paddle.fluid import compiler -from .async_optimizer import AsyncMetaOptimizer +from .parameter_server_optimizer import ParameterServerOptimizer -class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer): +class ParameterServerGraphOptimizer(ParameterServerOptimizer): def __init__(self, optimizer): - super(AsyncGraphExecutionOptimizer, self).__init__(optimizer) + super(ParameterServerGraphOptimizer, self).__init__(optimizer) self.inner_opt = optimizer # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [] @@ -31,6 +31,9 @@ class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer): if self.role_maker.is_server(): return False + if self.role_maker._is_heter_parameter_server_mode: + return False + return True def _disable_strategy(self, dist_strategy): diff --git a/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py similarity index 82% rename from python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py rename to python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py index b65435497284d279ebdea026e7ac88883a724c7c..ecb198bedf9041aa3ffc929a72cce3c209f03b61 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py @@ -15,9 +15,9 @@ from paddle import fluid from .meta_optimizer_base import MetaOptimizerBase -class AsyncMetaOptimizer(MetaOptimizerBase): +class ParameterServerOptimizer(MetaOptimizerBase): def __init__(self, optimizer): - super(AsyncMetaOptimizer, self).__init__(optimizer) + super(ParameterServerOptimizer, self).__init__(optimizer) self.inner_opt = optimizer # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [] @@ -68,6 +68,21 @@ class AsyncMetaOptimizer(MetaOptimizerBase): _startup = worker.init_from_server_pass(_startup, compiled_config) _startup = worker.delet_extra_optimizes_pass(_startup, compiled_config) + + # for heter program + if self.role_maker._is_heter_parameter_server_mode: + from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker + if self.role_maker._is_heter_worker(): + # for heter worker + _main = heter_worker.split_heter_worker_ops_pass( + _main, compiled_config) + else: + # for default worker + _main = heter_worker.split_trainer_ops_pass(_main, + 
compiled_config) + # for startup change + _startup = heter_worker.delete_startup_useless_ops_var_pass( + _startup, _main, compiled_config) else: _main = worker.append_send_ops_pass(_main, compiled_config) _startup = _startup @@ -129,9 +144,12 @@ class AsyncMetaOptimizer(MetaOptimizerBase): _origin_startup_program, strategy, self.role_maker) - main_program, startup_program = \ - self._build_trainer_programs(compiled_config) if self.role_maker.is_worker() \ - else self._build_pserver_programs(compiled_config) + if self.role_maker.is_worker() or self.role_maker._is_heter_worker(): + main_program, startup_program = self._build_trainer_programs( + compiled_config) + elif self.role_maker.is_server(): + main_program, startup_program = self._build_pserver_programs( + compiled_config) loss.block.program = main_program fluid.framework.switch_startup_program(startup_program) diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index fe9221307cbacfa1beaf030b70a4e4b9223769cc..d5a45e2b4e1aeda2e1c66c0a5a36236622f093ec 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -20,8 +20,6 @@ from paddle.fluid.optimizer import PipelineOptimizer as PO from .meta_optimizer_base import MetaOptimizerBase from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op -__all__ = ["PipelineOptimizer"] - class PipelineHelper(CollectiveHelper): def __init__(self, role_maker, nrings=1, wait_port='6174'): diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py index 45130b447125f6ecbade2e4e5e3dad2f127fda52..3eb3ca6127cfe0d0a7a458c6c44e09ce22e7b24a 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py @@ -14,8 +14,6 @@ from paddle.fluid.optimizer import RecomputeOptimizer as RO from .meta_optimizer_base import MetaOptimizerBase -__all__ = ["RecomputeOptimizer"] - class RecomputeOptimizer(MetaOptimizerBase): def __init__(self, optimizer): diff --git a/python/paddle/distributed/fleet/metrics/__init__.py b/python/paddle/distributed/fleet/metrics/__init__.py index abf198b97e6e818e1fbe59006f98492640bcee54..bc30c063787d28e5bcb4455b3cbd56372879fe0a 100644 --- a/python/paddle/distributed/fleet/metrics/__init__.py +++ b/python/paddle/distributed/fleet/metrics/__init__.py @@ -11,3 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
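+# Note: several names exported below ("sum", "max", "min") collide with
+# Python builtins, so a star-import of this package shadows those builtins
+# in the importing module. A qualified import avoids that, e.g. (sketch):
+#
+#   import paddle.distributed.fleet.metrics as metrics
+#   metrics.sum(...)  # the builtin sum() stays untouched
+#
+#   from paddle.distributed.fleet.metrics import *
+#   sum  # now the fleet metric, no longer the builtin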
+ +from .metric import * + +__all__ = [ + "sum", + "max", + "min", + "auc", + "mae", + "rmse", + "mse", + "acc", +] diff --git a/python/paddle/distributed/fleet/runtime/__init__.py b/python/paddle/distributed/fleet/runtime/__init__.py index a796a73fc981b7edbcd57e8f5858456031e7ae6e..cf718b199e52e422ff8f2b66317f3cd6123c76a1 100644 --- a/python/paddle/distributed/fleet/runtime/__init__.py +++ b/python/paddle/distributed/fleet/runtime/__init__.py @@ -14,5 +14,3 @@ from .collective_runtime import CollectiveRuntime from .parameter_server_runtime import ParameterServerRuntime - -__all__ = ["CollectiveRuntime," "ParameterServerRuntime", ] diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index c731ed08893348d0be604eb383905cd4a9d6e228..1741f10ccb1c28bfe6abaa63e754568fa08e21ce 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -196,6 +196,18 @@ class ParameterServerRuntime(RuntimeBase): else: warnings.warn("communicator has been initialized, skip") + def _get_executor(self): + if self.role_maker._is_heter_worker(): + if self.role_maker._get_heter_worker_device() == "GPU": + gpu_id = int(os.getenv("FLAGS_selected_gpus", "0")) + executor = Executor(fluid.CUDAPlace(gpu_id)) + else: + raise ValueError("Not Support Device {}".format( + self.role_maker._get_heter_worker_device())) + else: + executor = fluid.Executor(fluid.CPUPlace()) + return executor + def _init_server(self, *args, **kwargs): if len(args) > 1: raise ValueError("init server can only accept 1 args: `dirname`") @@ -204,9 +216,15 @@ class ParameterServerRuntime(RuntimeBase): else: model_dirname = None - executor = fluid.Executor(fluid.CPUPlace()) + if self.role_maker._is_heter_worker(): + self._init_worker() + + executor = self._get_executor() executor.run(fluid.default_startup_program()) + if self.role_maker._is_heter_worker(): + return + if not model_dirname: return @@ -237,12 +255,12 @@ class ParameterServerRuntime(RuntimeBase): # self._load_sparse_params(dirname=model_dir, varnames=distribtued_varnames) def _run_server(self): - executor = fluid.Executor(fluid.CPUPlace()) + executor = self._get_executor() executor.run(fluid.default_main_program()) def _stop_worker(self): self._communicator.stop() - executor = fluid.Executor(fluid.CPUPlace()) + executor = self._get_executor() executor.close() def _get_optimizer_status(self, op, param_name): diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py index 212308159aabb123fde11543b3482f2232b4925d..f1911408c84a9dde56a8674e88e0fb8ad575cae7 100644 --- a/python/paddle/distributed/fleet/utils/__init__.py +++ b/python/paddle/distributed/fleet/utils/__init__.py @@ -15,4 +15,4 @@ from .fs import * from .http_server import KVHandler, KVHTTPServer, KVServer -__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__ +#__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__ diff --git a/python/paddle/fluid/incubate/fleet/base/fleet_base.py b/python/paddle/fluid/incubate/fleet/base/fleet_base.py index f885e51ef7f0d82ca50c7beb6ee6cd443dfc61d4..40cc2d2dd4e3823796451e5f335b7c4e765d5908 100644 --- a/python/paddle/fluid/incubate/fleet/base/fleet_base.py +++ b/python/paddle/fluid/incubate/fleet/base/fleet_base.py @@ -145,7 +145,7 @@ class Fleet(object): Returns: bool: True if this is a node of server, - False if not. 
+ False if not """ return self._role_maker.is_server() diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index 7f8db694d3601be072ab30ffbbd345b25ffafd80..be27a7c5214e6b4b730d14cb4a64118f24506860 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -343,7 +343,6 @@ class MPISymetricRoleMaker(MPIRoleMaker): def get_pserver_endpoints(self): """ get pserver endpoints - Returns: endpoints(list): pserver endpoints """ diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py index 1a7a82fbfac19b41e8b96c231ca74398f6b2214c..236cb458be4c6a07f768761b41464e64d4d53f77 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py @@ -467,7 +467,7 @@ class FleetTranspiler(Fleet): opts = public._get_optimize_ops(self._origin_main_program) for op in opts: if "Param" in op.input_names and \ - "LearningRate" in op.input_names and op.input("Param")[0] == param_name: + "LearningRate" in op.input_names and op.input("Param")[0] == param_name: return op def _save_dense_params(self, executor, dirname, context, main_program): @@ -700,8 +700,8 @@ if you would like to save all variables in a return False if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var.desc.type() == core.VarDesc.VarType.READER: + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.READER: return False return var.persistable @@ -846,4 +846,4 @@ class ParameterServerOptimizer(DistributedOptimizer): fleet.compiled_config = compiled_config fleet.main_program, fleet.startup_program = \ self._build_trainer_programs(compiled_config) if fleet.is_worker() \ - else self._build_pserver_programs(compiled_config) + else self._build_pserver_programs(compiled_config) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..e8668e39bd4e2e9724d79352f805aa6e6d68e5c4 --- /dev/null +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
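+# A sketch of how the caller is expected to compose the passes below
+# (mirroring parameter_server_optimizer.py; `role_maker` and
+# `compiled_config` belong to that caller and are not defined here):
+#
+#   if role_maker._is_heter_worker():
+#       main = split_heter_worker_ops_pass(main, compiled_config)
+#   else:
+#       main = split_trainer_ops_pass(main, compiled_config)
+#   startup = delete_startup_useless_ops_var_pass(startup, main,
+#                                                 compiled_config)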
+
+from __future__ import print_function
+import warnings
+
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+
+from paddle.fluid.transpiler.details.program_utils import delete_ops
+from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import find_heter_ops
+from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import create_heter_program
+from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import create_trainer_program
+from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import find_block_joints
+from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import find_op_input_output
+from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import get_vars_name_in_block
+
+
+def split_heter_worker_ops_pass(program, config):
+    """
+    split heter worker program from origin-program
+    1. find heter ops (located on a different device)
+    2. find the input & output of every heter-block
+    3. create the heter worker program and add a listen_and_serv op
+    """
+    default_device = "cpu"
+    program, heter_ops, _, program_block_ops = find_heter_ops(program,
+                                                              default_device)
+    if len(heter_ops) == 0:
+        warnings.warn(
+            "Currently running in Heter Parameter Server mode, but no OP runs on a heterogeneous device. Please check your code."
+        )
+        return program
+
+    current_device = "gpu"
+    if current_device not in heter_ops:
+        raise ValueError("No op that runs on device {} was found.".format(
+            current_device))
+
+    block_vars_detail = find_block_joints(program, program_block_ops, heter_ops)
+    heter_program = framework.Program()
+    create_heter_program(program, config, heter_program, heter_ops,
+                         block_vars_detail, current_device)
+    return heter_program
+
+
+def split_trainer_ops_pass(program, config):
+    """
+    split cpu-trainer program from origin-program
+    1. find heter ops (located on a different device)
+    2. find the input & output of every heter-block
+    3. create the cpu-trainer program and add send & recv ops
+    """
+    # Todo: support user define default_device (MrChengmo)
+    default_device = "cpu"
+    program, heter_ops, _, program_block_ops = find_heter_ops(program,
+                                                              default_device)
+    block_vars_detail = find_block_joints(program, program_block_ops, heter_ops)
+    create_trainer_program(program, config, heter_ops, block_vars_detail)
+    return program
+
+
+def delete_startup_useless_ops_var_pass(startup_program, main_program, config):
+    """
+    delete variables that are not used in the current main_program
+    """
+    # find all ops and their vars
+    vars_in_main_program = get_vars_name_in_block(main_program.global_block())
+
+    block_nums = startup_program.num_blocks
+    for block_index in range(1, block_nums):
+        current_block = startup_program.block(block_index)
+        # delete useless ops
+        need_delete_op = []
+        for op in current_block.ops:
+            inputs, outputs = find_op_input_output(startup_program,
+                                                   current_block, op)
+            inputs += outputs
+            # Todo: delete some concat op
+            if not set(inputs) & set(vars_in_main_program):
+                need_delete_op.append(op)
+        delete_ops(current_block, need_delete_op)
+
+        # delete useless vars
+        for var in current_block.vars:
+            if var.name not in vars_in_main_program:
+                startup_program._remove_var(var.name)
+
+    return startup_program
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
index 20f8aa3bb100da5fa622a4578997420250a63e70..e3bda62fc4afd0963651cbf8d2b05e770ccba1f5 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
@@ -37,7 +37,7 @@ LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched
 
 def _is_optimizer_op(op):
     if "Param" in op.input_names and \
-        "LearningRate" in op.input_names:
+            "LearningRate" in op.input_names:
         return True
     return False
@@ -49,7 +49,7 @@ def _same_or_split_var(p_name, var_name):
 
 def _get_optimizer_input_shape(op_type, varkey, orig_shape, param_shape):
     """
     Returns the shape for optimizer inputs that need to be reshaped when
-    Param and Grad is split to multiple servers.
+    Param and Grad are split across multiple servers.
     """
     # HACK(typhoonzero) : Should use functions of corresponding optimizer in
     # optimizer.py to get the shape, do not bind this in the transpiler.
@@ -542,7 +542,7 @@ def add_optimizer_pass(program, config):
     for _, op in enumerate(optimize_ops):
         # optimizer is connected to itself
         if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name and \
-            op not in global_ops:
+                op not in global_ops:
             __append_optimize_op__(op, per_opt_block, grad_to_block_id,
                                    merged_var, lr_ops)
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
index f9889997d9e38c98c4a736a62dbc72da7029f337..378c8fc23d7528766ca9eca062c87a4511e32b46 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
@@ -12,33 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Copyright(c) 2020 PaddlePaddle Authors.All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0(the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http: // www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - from __future__ import print_function from functools import reduce import collections import math import os +import warnings import six +import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.core import CommContext +import paddle.fluid.framework as framework from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode from paddle.fluid.incubate.fleet.parameter_server.ir import vars_metatools from paddle.fluid.incubate.fleet.parameter_server.ir.ps_dispatcher import RoundRobin, PSDispatcher +from paddle.fluid.transpiler.details.program_utils import delete_ops OP_NAME_SCOPE = "op_namescope" CLIP_OP_NAME_SCOPE = "@CLIP" @@ -58,8 +48,8 @@ def _get_lr_ops(program): for index, op in enumerate(program.global_block().ops): role_id = int(op.attr(RPC_OP_ROLE_ATTR_NAME)) if role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) or \ - role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) | \ - int(OPT_OP_ROLE_ATTR_VALUE): + role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) | \ + int(OPT_OP_ROLE_ATTR_VALUE): lr_ops.append(op) return lr_ops @@ -122,9 +112,20 @@ class MergedVariable: self.offsets = offsets +def Singleton(cls): + _instance = {} + + def _singleton(*args, **kargs): + if cls not in _instance: + _instance[cls] = cls(*args, **kargs) + return _instance[cls] + + return _singleton + + +@Singleton class CompileTimeStrategy(object): def __init__(self, main_program, startup_program, strategy, role_maker): - self.min_block_size = 8192 self.origin_main_program = main_program @@ -177,6 +178,12 @@ class CompileTimeStrategy(object): def get_ps_endpoints(self): return self.role_maker.get_pserver_endpoints() + def get_heter_worker_endpoints(self): + return self.role_maker._get_heter_worker_endpoints() + + def get_heter_worker_endpoint(self): + return self.role_maker._get_heter_worker_endpoint() + def get_origin_programs(self): return self.origin_main_program, self.origin_startup_program @@ -810,6 +817,30 @@ class CompileTimeStrategy(object): return sparse_param_grads, dense_param_grads + def remove_var_pair_by_grad(self, var_name): + + for index, pair in enumerate(self.merged_variables_pairs): + var = pair[0] + var_grad = pair[1] + if var_grad.merged_var.name == var_name: + del self.merged_variables_pairs[index] + + for index, pair in enumerate(self.merged_dense_pairs): + var = pair[0] + var_grad = pair[1] + if var_grad.merged_var.name == var_name: + del self.merged_dense_pairs[index] + return + + for index, pair in enumerate(self.merged_sparse_pairs): + var = pair[0] + var_grad = pair[1] + if var_grad.merged_var.name == var_name: + del self.merged_sparse_pairs[index] + return + + print("Not find {} in self.merge_pairs".format(var_name)) + def _is_opt_role_op(op): # NOTE : depend on oprole to find out whether this op is for @@ -817,7 +848,7 @@ def _is_opt_role_op(op): op_maker = core.op_proto_and_checker_maker optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize if op_maker.kOpRoleAttrName() in op.attr_names and \ - int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role): + int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role): return True 
return False diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index fe483bddd6a482a34431e17fee354f6a8f5d80b1..82e626dace18339d30f5b482be2e1c8e340a6c78 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -13,7 +13,13 @@ # limitations under the License. from __future__ import print_function +import six +import collections +import warnings +import math +from functools import reduce +import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.framework as framework @@ -34,6 +40,10 @@ LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() +DEVICE_LIST = ["cpu", "gpu", "xpu"] +COMMUNICATE_OPS_TYPE = ["send", "recv", "fetch_barrier", "send_barrier"] +DEFAULT_DEVICE = 'cpu' + def delete_optimizer_pass(program, config): def _delete_optimizer_op_and_vars(_program, optimize_ops): @@ -250,7 +260,7 @@ def fake_init_ops_pass(program, config): return list(set(dist_varnames + sparse_varnames)) def _fake_init_sparsetable(sparse_table_names): - #delete table init op + # delete table init op for table_name in sparse_table_names: table_var = program.global_block().vars[table_name] table_param_init_op = [] @@ -307,3 +317,871 @@ def delet_extra_optimizes_pass(program, config): program.global_block()._remove_var(var) return program + + +def find_heter_ops(program, default_device="cpu"): + if default_device not in DEVICE_LIST: + raise ValueError("Given device {} is not in device list {}".format( + default_device, DEVICE_LIST)) + + def _is_heter_op(op, current_heter_device, default_device="cpu"): + heter_devices = list(DEVICE_LIST) + heter_devices.remove(default_device) + op_device = op.attr("op_device") + op_type = op.type + if op_device in heter_devices: + return True + elif op_type in COMMUNICATE_OPS_TYPE and current_heter_device != default_device: + # for distributed communciate ops: send & recv & barrier etc. + # Todo: need update this method + op._set_attr('op_device', current_heter_device) + return True + elif op_device == None or op_device == default_device: + op._set_attr('op_device', default_device) + return False + return False + + def _is_same_device(op, pre_device, default_device="cpu"): + op_device = op.attr("op_device") + if op_device == pre_device: + return True + if pre_device == default_device: + return True + return False + + def _append_heter_op(op, current_heter_block_ops, heter_ops): + op_device = op.attr("op_device") + if op_device not in heter_ops: + heter_ops[op_device] = {} + current_heter_block_ops.append(op) + + origin_porgram = program.clone() + block = program.global_block() + + program_block_ops = [] + default_ops = {default_device: {}} + heter_ops = {} + block_index = 0 + # heter_ops: {"gpu": {1:[op1, op2, ...], 2:[op1, op2, ...] }; "xpu": {3:[op1, op2, ...], 4:[op1, op2, ...] 
}} + + current_heter_block_ops = [] + current_default_block_ops = [] + current_heter_device = default_device + is_heter = False + for op in block.ops: + if _is_heter_op(op, current_heter_device, default_device): + # for gpu/xpu-op + is_heter = True + + # for cpu-op block append + if len(current_default_block_ops) > 1: + default_ops[default_device][ + block_index] = current_default_block_ops + program_block_ops.append(current_default_block_ops) + current_default_block_ops = [] + block_index += 1 + + if _is_same_device(op, current_heter_device, default_device): + # for gpu-op, gpu-op -> gpu-op,... + current_heter_device = op.attr("op_device") + _append_heter_op(op, current_heter_block_ops, heter_ops) + else: + # for gpu-op -> xpu-op, ... + op_device = current_heter_block_ops[0].attr("op_device") + heter_ops[op_device][block_index] = current_heter_block_ops + program_block_ops.append(current_heter_block_ops) + block_index += 1 + current_heter_block_ops = [] + current_heter_device = op.attr("op_device") + _append_heter_op(op, current_heter_block_ops, heter_ops) + + elif is_heter: + # for gpu/xpu-op -> cpu-op + op_device = current_heter_block_ops[0].attr("op_device") + heter_ops[op_device][block_index] = current_heter_block_ops + program_block_ops.append(current_heter_block_ops) + block_index += 1 + current_heter_block_ops = [] + current_heter_device = default_device + is_heter = False + current_default_block_ops.append(op) + else: + # for cpu-op + current_default_block_ops.append(op) + + if current_default_block_ops != []: + default_ops[default_device][block_index] = current_default_block_ops + program_block_ops.append(current_default_block_ops) + + if current_heter_block_ops != []: + op_device = current_heter_block_ops[0].attr("op_device") + heter_ops[op_device][block_index] = current_heter_block_ops + program_block_ops.append(current_heter_block_ops) + + if len(heter_ops) == 0: + warnings.warn( + "No heterogeneous OP was found in your program , " + " please using fluid.device_guard() to run OPs on different device.") + + total_heter_ops = 0 + heter_blocks = 0 + for device in heter_ops.keys(): + heter_block_dict = heter_ops[device] + heter_blocks += len(heter_block_dict) + for _, heter_block in heter_block_dict.items(): + total_heter_ops += len(heter_block) + print( + "There are {} OPs in your main_program, and contains {} heter-OPs which is made up of {} heter-blocks.". 
+ format(len(block.ops), total_heter_ops, heter_blocks)) + return origin_porgram, heter_ops, default_ops, program_block_ops + + +def create_heter_program(program, config, heter_program, heter_ops, + block_var_detail, current_device): + # add heter op + optimizer_block = [] + grad_to_block_id = [] + send_grad_var_list = [] + + pre_block_idx = heter_program.num_blocks - 1 + for index, heter_block_ops in heter_ops[current_device].items(): + heter_block = heter_program._create_block(pre_block_idx) + optimizer_block.append(heter_block) + for _, op in enumerate(heter_block_ops): + block_append_op(heter_program, program, heter_block, op) + + # add relate variables + inputs = _get_input_map_from_op(program.global_block().vars, op) + add_vars_by_op_map(inputs, heter_program) + + outputs = _get_output_map_from_op(program.global_block().vars, op) + add_vars_by_op_map(outputs, heter_program) + + entrance_vars = block_var_detail[index]["entrance"] + add_vars_by_var_list(entrance_vars, program, heter_program) + exit_vars = block_var_detail[index]["exit"] + add_vars_by_var_list(exit_vars, program, heter_program) + + comm_info = get_communicate_var_info(program, index, entrance_vars, + exit_vars) + + grad_to_block_id.append(comm_info["block_input_var_name"] + ":" + str( + heter_block.idx)) + + # create slice op + first_op_index = 0 + + get_type_var_name = comm_info["input_var_reshape_name"][0].split( + ".input_reshape@Heter")[0] + get_type_var = heter_program.global_block().vars[get_type_var_name] + + insert_recv_slice_op( + heter_program, heter_block, first_op_index, + comm_info["block_input_var_name"], + (-1, sum(comm_info["input_var_reshape_dim"])), get_type_var.dtype, + get_type_var.type, comm_info["input_var_reshape_name"], [ + (-1, comm_info["input_var_reshape_dim"][i]) + for i in range(len(comm_info["input_var_reshape_dim"])) + ]) + first_op_index += len(comm_info["input_var_reshape_dim"]) + # create reshape op + for i in range(len(comm_info["input_var_reshape_name"])): + var_name = entrance_vars[i] + insert_reshape_op( + heter_program, + heter_block, + first_op_index, + comm_info["input_var_reshape_name"][i], + var_name, ) + first_op_index += 1 + + first_op_index = len(heter_block.ops) + + # create send reshape op + for i in range(len(exit_vars)): + insert_reshape_op(heter_program, heter_block, first_op_index, + exit_vars[i], + comm_info["output_var_reshape_name"][i], + [-1, comm_info["output_var_reshape_dim"][i]]) + first_op_index += 1 + + # create send concat op + insert_send_concat_op(heter_program, heter_block, first_op_index, + comm_info["output_var_reshape_name"], + comm_info["block_output_var_name"], + [-1, sum(comm_info["output_var_reshape_dim"])]) + check_op_device(heter_block, current_device) + send_grad_var_list = send_grad_var_list + add_heter_send_op( + program, heter_program, heter_block, block_var_detail[index]) + + # add step conter + send_input_vars = [] + dummy_output = [] + trainer_id = config.get_role_id() + pserver_endpoints = config.get_ps_endpoints() + optimizer_block[-1].append_op( + type="send", + inputs={"X": send_input_vars}, + outputs={"Out": dummy_output}, + attrs={ + "send_varnames": [STEP_COUNTER], + "merge_add": True, + "use_send_handler": False, + "endpoints": pserver_endpoints + }) + + # add info in listen&serv + attrs = { + "grad_to_block_id": grad_to_block_id, + "sparse_grad_to_param": None, + "lr_decay_block_id": None, + "dense_optimize_blocks": None, + "sparse_optimize_blocks": None, + "optimize_blocks": optimizer_block, + + # runtime attribute + "endpoint": 
config.get_heter_worker_endpoint(), + "pserver_id": config.get_role_id(), + "Fanin": config.get_trainers(), + "distributed_mode": config.get_distributed_mode(), + "rpc_get_thread_num": 12, + "rpc_send_thread_num": 12, + "rpc_prefetch_thread_num": 12 + } + + # append the listen_and_serv op + heter_program.global_block().append_op( + type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=attrs) + + check_heter_compile_time_strategy(program, config, send_grad_var_list) + + +def check_heter_compile_time_strategy(program, config, send_grad_var_list): + origin_grad_var_list = [] + for _, var_grad in config.merged_variables_pairs: + origin_grad_var_list.append(var_grad.merged_var.name) + + origin_grad_var_list = list(set(origin_grad_var_list)) + send_grad_var_list = list(set(send_grad_var_list)) + useless_grad_var_list = list( + set(origin_grad_var_list) - set(send_grad_var_list)) + + for useless_grad_var in useless_grad_var_list: + config.remove_var_pair_by_grad(useless_grad_var) + + +def create_trainer_program(program, config, heter_ops, block_var_detail): + for device in heter_ops.keys(): + for heter_block_index in sorted(heter_ops[device]): + replace_ops_by_communicate_op(program, config, heter_block_index, + heter_ops[device][heter_block_index], + block_var_detail) + remove_trainer_send_op(program, config, heter_block_index, + block_var_detail) + deleter_trainer_useless_var(program) + check_op_device(program.global_block(), DEFAULT_DEVICE) + + +def replace_ops_by_communicate_op(program, config, heter_block_index, ops_list, + block_var_detail): + all_op = program.global_block().ops + start_op = ops_list[0] + first_op_idx = -1 + for op in all_op: + if is_same_op(op, start_op): + first_op_idx = all_op.index(op) + break + assert first_op_idx != -1 + delete_same_ops(program.global_block(), ops_list) + + mode = config.get_distributed_mode() + heter_worker_endpoint = config.get_heter_worker_endpoint() + entrance_var = block_var_detail[heter_block_index]["entrance"] + exit_var = block_var_detail[heter_block_index]["exit"] + + default_device_comm_info = get_communicate_var_info( + program, heter_block_index - 1, + block_var_detail[heter_block_index - 1]["entrance"], + block_var_detail[heter_block_index - 1]["exit"]) + comm_info = get_communicate_var_info(program, heter_block_index, + entrance_var, exit_var) + + # create reshape op + for i in range(len(entrance_var)): + insert_reshape_op( + program, + program.global_block(), first_op_idx, entrance_var[i], + default_device_comm_info["output_var_reshape_name"][i], + [-1, default_device_comm_info["output_var_reshape_dim"][i]]) + first_op_idx += 1 + + # create concat op + insert_send_concat_op( + program, + program.global_block(), first_op_idx, + default_device_comm_info["output_var_reshape_name"], + default_device_comm_info["block_output_var_name"], + [-1, sum(default_device_comm_info["output_var_reshape_dim"])]) + first_op_idx += 1 + + # create send op + send_input_vars = [ + program.global_block().vars[default_device_comm_info[ + "block_output_var_name"]] + ] + + get_type_var_name = comm_info["output_var_reshape_name"][0].split( + ".output_reshape@Heter")[0] + get_type_var = program.global_block().vars[get_type_var_name] + + program.global_block().create_var( + name=comm_info["block_output_var_name"], + shape=(-1, sum(comm_info["output_var_reshape_dim"])), + dtype=get_type_var.dtype, + type=get_type_var.type) + + recv_vars = [ + program.global_block().vars[comm_info["block_output_var_name"]] + ] + + program.global_block()._insert_op( + 
+def remove_trainer_send_op(program, config, heter_block_index,
+                           block_var_detail):
+    # if the trainer does FF->BP->SEND, it holds both vars: var, var@GRAD
+    # if the trainer only does SEND, it holds one var: var@GRAD
+    # delete the send op if the trainer doesn't hold the paired var (var <-> var@GRAD)
+    persistables = block_var_detail[heter_block_index]["persistables"]
+    need_remove_send_op = []
+    need_remove_grad_var = []
+    for op in find_send_op(program):
+        input_list, _ = find_op_input_output(program,
+                                             program.global_block(), op)
+        for var_name in input_list:
+            origin_var_name = var_name.split("@GRAD")[0]
+            if origin_var_name in persistables:
+                need_remove_send_op.append(op)
+                need_remove_grad_var.append(var_name)
+    need_remove_send_op = list(set(need_remove_send_op))
+    delete_ops(program.global_block(), need_remove_send_op)
+    for grad_var_name in need_remove_grad_var:
+        config.remove_var_pair_by_grad(grad_var_name)
+
+
+def add_heter_send_op(program, heter_program, block, block_var_detail):
+    def _get_send_op_dict():
+        send_op_dict = {}
+        send_op_list = find_send_op(program)
+        for op in send_op_list:
+            input_list, _ = find_op_input_output(program,
+                                                 program.global_block(), op)
+            for var in input_list:
+                send_op_dict[var] = op
+        return send_op_dict
+
+    send_grad_var_list = []
+    send_op_dict = _get_send_op_dict()
+    for persistable_var in block_var_detail["persistables"]:
+        # check var_name == var@GRAD
+        if "@GRAD" not in persistable_var:
+            continue
+        if "GRAD" != persistable_var.split("@")[-1]:
+            continue
+        if persistable_var not in send_op_dict:
+            continue
+        block_append_op(program, heter_program, block,
+                        send_op_dict[persistable_var])
+        send_grad_var_list.append(persistable_var)
+    return send_grad_var_list
+
+
+def find_send_op(program):
+    send_op_list = []
+    for op in program.global_block().ops:
+        if op.type == "send":
+            send_op_list.append(op)
+    return send_op_list
+
+
+def get_communicate_var_info(program, block_index, entrance_var_list,
+                             exit_var_list):
+    input_var_reshape_dim = []
+    input_var_reshape_name = []
+    block_input_var_name = "joint_{}_{}@Heter".format(block_index - 1,
+                                                      block_index)
+    output_var_reshape_dim = []
+    output_var_reshape_name = []
+    block_output_var_name = "joint_{}_{}@Heter".format(block_index,
+                                                       block_index + 1)
+    entrance_var_list.sort()
+    exit_var_list.sort()
+    # input
+    # Heter_SERVER_BLOCK_index@JOINT_VAR -> slice -> var@Heter_SERVER_BLOCK@INPUT_RESHAPE_VAR -> reshape -> var
+    for name in entrance_var_list:
+        var = program.global_block().vars[name]
+        shape = var.shape
+        if len(shape) < 2 or shape[0] != -1:
+            raise ValueError(
+                "Variable {} does not support heter training. Its shape is {}".
+                format(name, shape))
+        recv_var_dim = -1 * reduce(lambda x, y: x * y, shape)
+        input_var_reshape_dim.append(recv_var_dim)
+        input_var_reshape_name.append("{}.input_reshape@Heter".format(name))
+
+    # output
+    # var -> reshape -> var@Heter_SERVER_BLOCK@INPUT_RESHAPE_VAR -> concat -> Heter_SERVER_BLOCK_index@JOINT_VAR
+    for var_name in exit_var_list:
+        var = program.global_block().vars[var_name]
+        shape = var.shape
+        if len(shape) < 2 or shape[0] != -1:
+            raise ValueError(
+                "Variable {} does not support heter training. Its shape is {}".
+                format(var_name, shape))
+        send_reshape_dim = -1 * reduce(lambda x, y: x * y, shape)
+        output_var_reshape_dim.append(send_reshape_dim)
+        output_var_reshape_name.append("{}.output_reshape@Heter".format(
+            var_name))
+
+    info = {
+        "input_var_reshape_dim": input_var_reshape_dim,
+        "input_var_reshape_name": input_var_reshape_name,
+        "block_input_var_name": block_input_var_name,
+        "output_var_reshape_dim": output_var_reshape_dim,
+        "output_var_reshape_name": output_var_reshape_name,
+        "block_output_var_name": block_output_var_name
+    }
+
+    return info
+
+
+def find_block_joints(program, program_block_ops_list, heter_ops):
+    block_var_detail = find_entrance_exit_private(program,
+                                                  program_block_ops_list)
+    block_var_detail = entrance_exit_check(program, program_block_ops_list,
+                                           block_var_detail, heter_ops)
+    block_var_detail = delete_block_useless_exit(
+        program, program_block_ops_list, block_var_detail)
+    return block_var_detail
+
+
+def find_entrance_exit_private(program, program_block_ops_list):
+    block_var_detail = []
+    persistables = []
+    for index, block_op_list in enumerate(program_block_ops_list):
+        block_input, block_output = find_ops_list_input_output(program,
+                                                               block_op_list)
+        persistables = screen_persistables(
+            program, block_input) + screen_persistables(program, block_output)
+        # find entrance & exit
+        block_private_vars = list(set(block_input) & set(block_output))
+        block_entrance = list(set(block_input) - set(block_private_vars))
+        block_exit = list(set(block_output) - set(block_private_vars))
+        detail = {
+            "entrance": block_entrance,
+            "exit": block_exit,
+            "private": block_private_vars,
+            "persistables": persistables
+        }
+        block_var_detail.append(detail)
+    return block_var_detail
+
+
+def entrance_exit_check(program, program_block_ops_list, block_var_detail,
+                        heter_ops):
+    for index in range(len(block_var_detail) - 1, -1, -1):
+        if index - 1 < 0:
+            break
+        previous_block_exit = block_var_detail[index - 1]["exit"]
+        previous_block_exit.sort()
+        current_block_entrance = block_var_detail[index]["entrance"]
+        current_block_entrance.sort()
+        if previous_block_exit == current_block_entrance:
+            continue
+        exist_vars = list(
+            set(previous_block_exit) & set(current_block_entrance))
+        need_add_vars = list(set(current_block_entrance) - set(exist_vars))
+        need_add_vars = find_need_var_from_previous_block(
+            need_add_vars, block_var_detail, index, heter_ops)
+
+        previous_block_private = block_var_detail[index - 1]["private"]
+        previous_block_entrance = block_var_detail[index - 1]["entrance"]
+        for var in need_add_vars:
+            if var not in previous_block_private and var not in previous_block_entrance:
+                previous_block_entrance.append(var)
+                previous_block_exit.append(var)
+    return block_var_detail
+
+
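get_communicate_var_info above sizes the joint variable by flattening every entrance/exit var's non-batch dims (the leading dim must be -1); the joint var's width is the sum of those flattened dims. A quick sketch with hypothetical shapes:

from functools import reduce

entrance_shapes = {"emb_a": (-1, 4, 8), "emb_b": (-1, 16)}
# -1 * reduce(...) flips the sign introduced by the -1 batch dim
reshape_dims = {name: -1 * reduce(lambda x, y: x * y, shape)
                for name, shape in entrance_shapes.items()}
print(reshape_dims)                 # {'emb_a': 32, 'emb_b': 16}
print(sum(reshape_dims.values()))   # 48: width of the joint_*@Heter var
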
+def find_need_var_from_previous_block(need_add_vars, block_var_detail,
+                                      current_index, heter_ops):
+    # create index_device_map
+    index_device_map = {}
+    for index in range(len(block_var_detail)):
+        index_device_map[index] = DEFAULT_DEVICE
+    for device in heter_ops:
+        for index in heter_ops[device].keys():
+            index_device_map[index] = device
+
+    pre_index = current_index - 1
+    need_ignore_var = []
+
+    # if need_add_var lives on the current device, no communication is needed
+    for var in need_add_vars:
+        while pre_index >= 0:
+            previous_block_private = block_var_detail[pre_index]["private"]
+            previous_block_exit = block_var_detail[pre_index]["exit"]
+            previous_block_entrance = block_var_detail[pre_index]["entrance"]
+            total_var = previous_block_private + previous_block_exit + previous_block_entrance
+            if var in total_var:
+                if index_device_map[current_index] == index_device_map[
+                        pre_index] and index_device_map[
+                            current_index] == DEFAULT_DEVICE:
+                    need_ignore_var.append(var)
+                    break
+            pre_index -= 1
+
+    need_add_vars = list(set(need_add_vars).difference(set(need_ignore_var)))
+    return need_add_vars
+
+
+def delete_block_useless_exit(program, program_block_ops_list,
+                              block_var_detail):
+    for index in range(len(block_var_detail)):
+        if index == len(block_var_detail) - 1:
+            break
+        current_block_exit = block_var_detail[index]["exit"]
+        next_block_entrance = block_var_detail[index + 1]["entrance"]
+        need_delete_var = []
+        for var in current_block_exit:
+            if var not in next_block_entrance:
+                need_delete_var.append(var)
+
+        for var in need_delete_var:
+            current_block_exit.remove(var)
+
+    return block_var_detail
+
+
+def check_op_device(block, device):
+    for op in block.ops:
+        op._set_attr('op_device', device)
+
+
+def screen_persistables(program, var_list):
+    need_remove = []
+    for var_name in var_list:
+        if "@GRAD" in var_name:
+            origin_var_name = var_name.split("@GRAD")[0]
+            var = program.global_block().vars[origin_var_name]
+        else:
+            var = program.global_block().vars[var_name]
+
+        if fluid.io.is_persistable(var):
+            need_remove.append(var_name)
+
+    for var_name in need_remove:
+        var_list.remove(var_name)
+    return need_remove
+
+
+def insert_reshape_op(program,
+                      block,
+                      index,
+                      var_name,
+                      new_var_name,
+                      new_var_shape=None):
+    input_var = program.global_block().vars[var_name]
+
+    if new_var_name not in program.global_block().vars:
+        out = program.global_block().create_var(
+            name=new_var_name,
+            shape=new_var_shape,
+            dtype=input_var.dtype,
+            type=input_var.type)
+    else:
+        out = program.global_block().vars[new_var_name]
+        new_var_shape = out.shape
+
+    x_shape = program.global_block().create_var(
+        name="{}.xshape@Heter".format(var_name), dtype=input_var.dtype)
+    block._insert_op(
+        index=index,
+        type="reshape2",
+        inputs={"X": input_var},
+        attrs={'shape': new_var_shape},
+        outputs={"Out": out,
+                 "XShape": x_shape})
+
+
+def insert_send_concat_op(program, block, index, var_name_list, new_var_name,
+                          new_var_shape):
+    input_var_list = [
+        program.global_block().vars[var_name] for var_name in var_name_list
+    ]
+
+    out = program.global_block().create_var(
+        name=new_var_name,
+        shape=new_var_shape,
+        dtype=input_var_list[0].dtype,
+        type=input_var_list[0].type)
+
+    block._insert_op(
+        index=index,
+        type='concat',
+        inputs={"X": input_var_list},
+        outputs={'Out': [out]},
+        attrs={'axis': -1,
+               'use_stack': False})
+
+
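insert_send_concat_op above and insert_recv_slice_op below are inverses along axis 1: the sender concatenates the flattened payloads, the receiver slices them back apart by width. A numpy sketch of that round trip, assuming two vars flattened to widths 3 and 2:

import numpy as np

# two flattened per-batch payloads, widths 3 and 2
a = np.arange(6.0).reshape(2, 3)
b = np.arange(4.0).reshape(2, 2)

joint = np.concatenate([a, b], axis=1)  # what the concat op sends: (2, 5)

# the slice ops on the receiving side cut the joint var back apart
widths, start, parts = [3, 2], 0, []
for w in widths:
    parts.append(joint[:, start:start + w])
    start += w

assert all(np.array_equal(p, q) for p, q in zip(parts, [a, b]))
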
+def insert_recv_slice_op(program, block, index, var_name, var_shape, dtype,
+                         type, new_var_name_list, new_var_shape_list):
+
+    if var_name not in program.global_block().vars:
+        input_var = program.global_block().create_var(
+            name=var_name, shape=var_shape, dtype=dtype, type=type)
+    else:
+        input_var = program.global_block().vars[var_name]
+
+    out_list = []
+    for i in range(len(new_var_name_list)):
+        if new_var_name_list[i] not in program.global_block().vars:
+            out = program.global_block().create_var(
+                name=new_var_name_list[i],
+                shape=new_var_shape_list[i],
+                dtype=input_var.dtype,
+                type=input_var.type)
+        else:
+            out = program.global_block().vars[new_var_name_list[i]]
+        out_list.append(out)
+
+    start_index = 0
+    end_index = 0
+    for i in range(len(new_var_name_list)):
+        starts = []
+        ends = []
+        attrs = {'axes': [1]}
+        end_index += new_var_shape_list[i][1]
+        starts.append(start_index)
+        ends.append(end_index)
+        attrs['starts'] = starts
+        attrs['ends'] = ends
+
+        block._insert_op(
+            index=index,
+            type='slice',
+            inputs={'Input': input_var},
+            attrs=attrs,
+            outputs={'Out': out_list[i]})
+        start_index = end_index
+        index += 1
+
+
+def delete_trainer_useless_var(program):
+    program_useful_var_list = []
+    for op in program.global_block().ops:
+        input_var_list, output_var_list = find_op_input_output(
+            program, program.global_block(), op)
+        op_var_list = list(set(input_var_list).union(set(output_var_list)))
+        program_useful_var_list = list(
+            set(program_useful_var_list).union(set(op_var_list)))
+
+    program_useless_var_list = list(
+        set(get_vars_name_in_block(program.global_block())).difference(
+            set(program_useful_var_list)))
+    for var in program_useless_var_list:
+        program.global_block()._remove_var(var)
+    return program_useless_var_list
+
+
+def block_append_op(program, origin_program, block, op):
+    inputs = _get_input_map_from_op(origin_program.global_block().vars, op)
+    for key, varlist in six.iteritems(inputs):
+        if not isinstance(varlist, list):
+            varlist = [varlist]
+        for var in varlist:
+            if var.name not in program.global_block().vars:
+                program.global_block()._clone_variable(var)
+
+    outputs = _get_output_map_from_op(origin_program.global_block().vars, op)
+    for key, varlist in six.iteritems(outputs):
+        if not isinstance(varlist, list):
+            varlist = [varlist]
+        for var in varlist:
+            if var.name not in program.global_block().vars:
+                program.global_block()._clone_variable(var)
+
+    if "_grad" not in op.type:
+        # for forward op
+        return block.append_op(
+            type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs())
+    else:
+        # for grad op
+        op_desc = op.desc
+        op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
+        backward = core.op_proto_and_checker_maker.OpRole.Backward
+        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
+
+        # append grad op
+        new_op_desc = block.desc.append_op()
+        new_op_desc.copy_from(op_desc)
+        new_op_desc._set_attr(op_role_attr_name, backward)
+
+        # set device guard
+        if op.desc.has_attr(device_attr_name):
+            op_device = op_desc.attr(device_attr_name)
+            new_op_desc._set_attr(device_attr_name, op_device)
+        block._sync_with_cpp()
+
+
+def add_vars_by_op_map(var_map, program):
+    for key, varlist in six.iteritems(var_map):
+        if not isinstance(varlist, list):
+            varlist = [varlist]
+        for i in range(len(varlist)):
+            var = varlist[i]
+            if var.name not in program.global_block().vars:
+                program.global_block()._clone_variable(var)
+
+
+def add_vars_by_var_list(var_name_list, origin_program, program):
+    for var_name in var_name_list:
+        if var_name not in program.global_block().vars:
+            var = origin_program.global_block().vars[var_name]
+
program.global_block()._clone_variable(var) + + +def get_varlist_from_op_map(var_map): + var_list = [] + for key, varlist in six.iteritems(var_map): + if not isinstance(varlist, list): + varlist = [varlist] + for i in range(len(varlist)): + var = varlist[i] + var_list.append(var.name) + return var_list + + +def find_ops_list_input_output(program, ops_list): + input_var_list = [] + output_var_list = [] + for op in ops_list: + inputs = _get_input_map_from_op(program.global_block().vars, op) + input_var_list += get_varlist_from_op_map(inputs) + outputs = _get_output_map_from_op(program.global_block().vars, op) + output_var_list += get_varlist_from_op_map(outputs) + + input_var_list = list(set(input_var_list)) + output_var_list = list(set(output_var_list)) + return input_var_list, output_var_list + + +def find_op_input_output(program, block, op): + input_var_list = [] + output_var_list = [] + inputs = _get_input_map_from_op(block.vars, op) + input_var_list += get_varlist_from_op_map(inputs) + outputs = _get_output_map_from_op(block.vars, op) + output_var_list += get_varlist_from_op_map(outputs) + input_var_list = list(set(input_var_list)) + output_var_list = list(set(output_var_list)) + return input_var_list, output_var_list + + +def get_vars_name_in_block(block): + vars_list = block.vars.keys() + vars_name_list = [var_name for var_name in vars_list] + return vars_name_list + + +def is_same_op(op1, op2): + if str(op1) != str(op2): + return False + return True + + +def _get_input_map_from_op(varmap, op): + """Returns a dict from op input name to the vars in varmap.""" + iomap = collections.OrderedDict() + for key in op.input_names: + vars = [] + for varname in op.input(key): + if varname == "@EMPTY@": + continue + if "lod_tensor_blocking_queue" in varname: + continue + vars.append(varmap[varname]) + if len(vars) == 1: + iomap[key] = vars[0] + else: + iomap[key] = vars + return iomap + + +def _get_output_map_from_op(varmap, op): + """Returns a dict from op output name to the vars in varmap.""" + iomap = collections.OrderedDict() + for key in op.output_names: + vars = [] + for varname in op.output(key): + if varname == "@EMPTY@": + continue + if "lod_tensor_blocking_queue" in varname: + continue + vars.append(varmap[varname]) + if len(vars) == 1: + iomap[key] = vars[0] + else: + iomap[key] = vars + return iomap + + +def delete_same_ops(block, ops): + for op in ops: + try: + for origin_op in block.ops: + if is_same_op(origin_op, op): + idx = list(block.ops).index(origin_op) + block._remove_op(idx) + break + except Exception as e: + print(e) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ec71e4c9912295ca0844ce91dd2e06e03d9a216d..19c46fd21b1cda5f6f3155250fb953ce9a962bb2 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1858,6 +1858,7 @@ def conv3d(input, return helper.append_activation(pre_act) +@deprecated(since="2.0.0", update_to="paddle.nn.functional.pool2d") @templatedoc() def pool2d(input, pool_size=-1, @@ -2075,6 +2076,7 @@ def pool2d(input, return pool_out +@deprecated(since="2.0.0", update_to="paddle.nn.functional.pool3d") @templatedoc() def pool3d(input, pool_size=-1, @@ -2303,6 +2305,7 @@ def pool3d(input, return pool_out +@deprecated(since="2.0.0", update_to="paddle.nn.functional.adaptive_pool2d") @templatedoc(op_type="pool2d") def adaptive_pool2d(input, pool_size, @@ -2450,6 +2453,7 @@ def adaptive_pool2d(input, return (pool_out, mask) if require_index else pool_out +@deprecated(since="2.0.0", 
update_to="paddle.nn.functional.adaptive_pool3d") @templatedoc(op_type="pool3d") def adaptive_pool3d(input, pool_size, @@ -10205,6 +10209,7 @@ def unstack(x, axis=0, num=None): return outs +@deprecated(since='2.0.0', update_to="paddle.expand") def expand(x, expand_times, name=None): """ :alias_main: paddle.expand @@ -10312,6 +10317,7 @@ def expand(x, expand_times, name=None): return out +@deprecated(since='2.0.0', update_to="paddle.expand_as") def expand_as(x, target_tensor, name=None): """ :alias_main: paddle.expand_as @@ -10377,6 +10383,9 @@ def expand_as(x, target_tensor, name=None): #(3,20) """ + if in_dygraph_mode(): + return core.ops.expand_as(x, target_tensor) + check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int32', 'int64', 'bool'], 'expand_as') check_variable_and_dtype(target_tensor, 'target_tensor', @@ -15004,6 +15013,7 @@ def gather_tree(ids, parents): return out +@deprecated(since="2.0.0", update_to="paddle.uniform") @templatedoc() def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None): diff --git a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py index fe7513ae84238527d25cc28fa40b01f1f099f1c8..863c001f226f86384e2820cb6877ded48cffa119 100644 --- a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py +++ b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py @@ -17,8 +17,9 @@ from __future__ import print_function import os import logging import tarfile - +import tempfile import random +import warnings import paddle import paddle.fluid.incubate.data_generator as data_generator @@ -57,7 +58,7 @@ def load_dnn_input_record(sent): def load_lr_input_record(sent): res = [] for _ in [x.split(':') for x in sent.split()]: - res.append(int(_[0])) + res.append(int(_[0]) % 10000) return res @@ -120,9 +121,62 @@ def prepare_data(): lr_input_dim = res[1] logger.info('dnn input dim: %d' % dnn_input_dim) logger.info('lr input dim: %d' % lr_input_dim) + return dnn_input_dim, lr_input_dim, train_file_path +def gen_fake_line(dnn_data_num=7, + dnn_data_range=1e5, + lr_data_num=5, + lr_data_range=1e5): + line = "" + + # for deep data + for index in range(dnn_data_num): + data = str(random.randint(0, dnn_data_range - 1)) + if index < dnn_data_num - 1: + data += " " + line += data + line += "\t" + + # for wide data + for index in range(lr_data_num): + data = str(random.randint(0, lr_data_range - 1)) + ":" + str(1) + if index < lr_data_num - 1: + data += " " + line += data + line += "\t" + + # for label + line += str(random.randint(0, 1)) + line += "\n" + return line + + +def prepare_fake_data(file_nums=8, file_lines=1000): + """ + Create fake data with same type as avazu_ctr_data + """ + file_dir = tempfile.mkdtemp() + warnings.warn("Fake data write in {}".format(file_dir)) + for file_index in range(file_nums): + with open( + os.path.join(file_dir, + "ctr_train_data_part_{}".format(file_index)), + 'w+') as fin: + file_str = "" + for line_index in range(file_lines): + file_str += gen_fake_line() + fin.write(file_str) + warnings.warn("Write done ctr_train_data_part_{}".format( + file_index)) + + file_list = [os.path.join(file_dir, x) for x in os.listdir(file_dir)] + assert len(file_list) == file_nums + + return file_list + + if __name__ == "__main__": pairwise_reader = DatasetCtrReader() pairwise_reader.run_from_stdin() diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py new file mode 100644 index 
0000000000000000000000000000000000000000..0de898d6dde217ec6d5cdf53611f986f7b04863f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
@@ -0,0 +1,220 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Distributed CTR model for testing the fleet API
+"""
+
+from __future__ import print_function
+
+import shutil
+import tempfile
+import time
+
+import paddle
+import paddle.fluid as fluid
+import os
+import numpy as np
+
+import ctr_dataset_reader
+from test_dist_fleet_heter_base import runtime_main, FleetDistHeterRunnerBase
+from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader
+from paddle.distributed.fleet.base.util_factory import fleet_util
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+
+class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
+    """
+    CTR model test using the Fleet API
+    """
+
+    def net(self, args, batch_size=4, lr=0.01):
+        """
+        network definition
+
+        Args:
+            batch_size(int): the size of mini-batch for training
+            lr(float): learning rate of training
+        Returns:
+            avg_cost: LoDTensor of cost.
+        """
+        dnn_input_dim, lr_input_dim = int(1e5), int(1e5)
+
+        dnn_data = fluid.layers.data(
+            name="dnn_data",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=1,
+            append_batch_size=False)
+        lr_data = fluid.layers.data(
+            name="lr_data",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=1,
+            append_batch_size=False)
+        label = fluid.layers.data(
+            name="click",
+            shape=[-1, 1],
+            dtype="float32",
+            lod_level=0,
+            append_batch_size=False)
+
+        datas = [dnn_data, lr_data, label]
+
+        if args.reader == "pyreader":
+            self.reader = fluid.io.PyReader(
+                feed_list=datas,
+                capacity=64,
+                iterable=False,
+                use_double_buffer=False)
+
+        # build dnn model
+        dnn_layer_dims = [128, 64, 32, 1]
+        dnn_embedding = fluid.layers.embedding(
+            is_distributed=False,
+            input=dnn_data,
+            size=[dnn_input_dim, dnn_layer_dims[0]],
+            param_attr=fluid.ParamAttr(
+                name="deep_embedding",
+                initializer=fluid.initializer.Constant(value=0.01)),
+            is_sparse=True)
+        dnn_pool = fluid.layers.sequence_pool(
+            input=dnn_embedding, pool_type="sum")
+        dnn_out = dnn_pool
+
+        # build lr model
+        lr_embedding = fluid.layers.embedding(
+            is_distributed=False,
+            input=lr_data,
+            size=[lr_input_dim, 1],
+            param_attr=fluid.ParamAttr(
+                name="wide_embedding",
+                initializer=fluid.initializer.Constant(value=0.01)),
+            is_sparse=True)
+        lr_pool = fluid.layers.sequence_pool(input=lr_embedding, pool_type="sum")
+
+        with fluid.device_guard("gpu"):
+            for i, dim in enumerate(dnn_layer_dims[1:]):
+                fc = fluid.layers.fc(
+                    input=dnn_out,
+                    size=dim,
+                    act="relu",
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Constant(value=0.01)),
+                    name='dnn-fc-%d' % i)
+                dnn_out = fc
+
+            merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
+            label = fluid.layers.cast(label, dtype="int64")
+            predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
+
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            fluid.layers.Print(avg_cost, message="avg_cost")
+
+        self.feeds = datas
+        self.train_file_path = ["fake1", "fake2"]
+        self.avg_cost = avg_cost
+        self.predict = predict
+
+        return avg_cost
+
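The net above keeps the sparse embeddings on CPU while fluid.device_guard("gpu") pins the dense fc stack and loss to GPU; the resulting op_device attribute is what the heter transpiler earlier in this patch groups blocks by. A minimal, standalone sketch of the mechanism (illustrative shapes and names, not part of the test):

import paddle.fluid as fluid

prog = fluid.Program()
with fluid.program_guard(prog):
    x = fluid.layers.data(name="x", shape=[4], dtype="float32")
    with fluid.device_guard("gpu"):
        y = fluid.layers.fc(input=x, size=2)

for op in prog.global_block().ops:
    # ops created under the guard carry op_device == "gpu"
    print(op.type, op.attr("op_device"))
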
+    def check_model_right(self, dirname):
+        model_filename = os.path.join(dirname, "__model__")
+
+        with open(model_filename, "rb") as f:
+            program_desc_str = f.read()
+
+        program = fluid.Program.parse_from_string(program_desc_str)
+        with open(os.path.join(dirname, "__model__.proto"), "w") as wn:
+            wn.write(str(program))
+
+    def do_pyreader_training(self, fleet):
+        """
+        do training using py_reader
+        Args:
+            fleet(Fleet api): the fleet object of Parameter Server, defines the distributed training role
+        """
+
+        exe = fluid.Executor(fluid.CPUPlace())
+        fleet.init_worker()
+        exe.run(fluid.default_startup_program())
+        batch_size = 4
+        train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
+        self.reader.decorate_sample_list_generator(train_reader)
+
+        for epoch_id in range(1):
+            self.reader.start()
+            try:
+                pass_start = time.time()
+                while True:
+                    exe.run(program=fluid.default_main_program())
+
+                pass_time = time.time() - pass_start
+            except fluid.core.EOFException:
+                self.reader.reset()
+
+        fleet.stop_worker()
+
+    def do_dataset_training(self, fleet):
+        train_file_list = ctr_dataset_reader.prepare_fake_data()
+
+        exe = fluid.Executor(fluid.CPUPlace())
+
+        fleet.init_worker()
+        exe.run(fluid.default_startup_program())
+
+        thread_num = 1
+        batch_size = 128
+        filelist = fleet_util.get_file_shard(train_file_list)
+        print("filelist: {}".format(filelist))
+
+        # config dataset
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset()
+        dataset.set_batch_size(batch_size)
+        dataset.set_use_var(self.feeds)
+        pipe_command = 'python ctr_dataset_reader.py'
+        dataset.set_pipe_command(pipe_command)
+
+        dataset.set_filelist(filelist)
+        dataset.set_thread(thread_num)
+
+        for epoch_id in range(1):
+            pass_start = time.time()
+            dataset.set_filelist(filelist)
+            exe.train_from_dataset(
+                program=fluid.default_main_program(),
+                dataset=dataset,
+                fetch_list=[self.avg_cost],
+                fetch_info=["cost"],
+                print_period=2,
+                debug=int(os.getenv("Debug", "0")))
+            pass_time = time.time() - pass_start
+            print("do_dataset_training done. 
using time {}".format(pass_time)) + if os.getenv("SAVE_MODEL") == "1": + model_dir = tempfile.mkdtemp() + fleet.save_inference_model(exe, model_dir, + [feed.name for feed in self.feeds], + self.avg_cost) + self.check_model_right(model_dir) + shutil.rmtree(model_dir) + + fleet.stop_worker() + print("do_dataset_training stop worker.") + + +if __name__ == "__main__": + runtime_main(TestHeterPsCTR2x2) diff --git a/python/paddle/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py index 969a7da3b71b69296f3313342adbf989c60edb50..2c6c018b9dfac13d97c242e1f36adbddf9dbf3f1 100644 --- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py +++ b/python/paddle/fluid/tests/unittests/test_adadelta_op.py @@ -17,6 +17,8 @@ from __future__ import print_function import unittest import numpy as np from op_test import OpTest +import paddle +import paddle.fluid as fluid class TestAdadeltaOp1(OpTest): @@ -108,5 +110,54 @@ class TestAdadeltaOp2(OpTest): self.check_output() +class TestAdadeltaV2(unittest.TestCase): + def test_adadelta_dygraph(self): + paddle.disable_static(paddle.CPUPlace()) + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adadelta( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_adadelta(self): + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + rms_optimizer = paddle.optimizer.Adadelta(learning_rate=0.1) + rms_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + def test_raise_error(self): + self.assertRaises(ValueError, paddle.optimizer.Adadelta, None) + self.assertRaises( + ValueError, paddle.optimizer.Adadelta, learning_rate=0.1, rho=None) + self.assertRaises( + ValueError, + paddle.optimizer.Adadelta, + learning_rate=0.1, + epsilon=None) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py new file mode 100644 index 0000000000000000000000000000000000000000..5a135cea52903a0d896df2d446b58d99e5a18993 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py @@ -0,0 +1,119 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+import paddle
+import paddle.nn.functional as F
+
+
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
+def avg_pool1D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=0,
+                             ceil_mode=False,
+                             exclusive=False,
+                             adaptive=False,
+                             data_type=np.float64):
+    N, C, L = x.shape
+    if global_pool == 1:
+        ksize = [L]
+    if adaptive:
+        L_out = ksize[0]
+    else:
+        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1
+                 ) // strides[0] + 1 if ceil_mode else (
+                     L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+
+    out = np.zeros((N, C, L_out))
+    for i in range(L_out):
+        if adaptive:
+            r_start = adaptive_start_index(i, L, ksize[0])
+            r_end = adaptive_end_index(i, L, ksize[0])
+        else:
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L))
+        x_masked = x[:, :, r_start:r_end]
+
+        field_size = (r_end - r_start) \
+            if (exclusive or adaptive) else (ksize[0])
+        if data_type == np.int8 or data_type == np.uint8:
+            out[:, :, i] = (np.rint(
+                np.sum(x_masked, axis=(2)) / field_size)).astype(data_type)
+        else:
+            out[:, :, i] = (np.sum(x_masked, axis=(2)) /
+                            field_size).astype(data_type)
+    return out
+
+
+class TestPool1d_API(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_adaptive_avg_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.adaptive_avg_pool1d(input, output_size=16)
+            result_np = avg_pool1D_forward_naive(
+                input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            ada_avg_pool1d_dg = paddle.nn.layer.AdaptiveAvgPool1d(
+                output_size=16)
+            result = ada_avg_pool1d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_adaptive_avg_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
+            result = F.adaptive_avg_pool1d(input, output_size=16)
+
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            result_np = avg_pool1D_forward_naive(
+                input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def test_adaptive_avg_pool1d(self):
+        for place in self.places:
+            self.check_adaptive_avg_dygraph_results(place)
+            self.check_adaptive_avg_static_results(place)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py
new file mode 100644
index 
0000000000000000000000000000000000000000..875fdf9e9c3f9a9b891ecc6911dfeda788eee271 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid import compiler, Program, program_guard +import paddle +import paddle.nn.functional as F +import paddle.fluid as fluid + + +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def max_pool1D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=False, + adaptive=False, + data_type=np.float64): + N, C, L = x.shape + if global_pool == 1: + ksize = [L] + if adaptive: + L_out = ksize[0] + else: + L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + L - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + + out = np.zeros((N, C, L_out)) + for i in range(L_out): + if adaptive: + r_start = adaptive_start_index(i, L, ksize[0]) + r_end = adaptive_end_index(i, L, ksize[0]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L)) + x_masked = x[:, :, r_start:r_end] + + out[:, :, i] = np.max(x_masked, axis=(2)) + return out + + +class TestPool1d_API(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_adaptive_max_dygraph_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = F.adaptive_max_pool1d(input, output_size=16) + + result_np = max_pool1D_forward_naive( + input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1d( + output_size=16) + result = ada_max_pool1d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_adaptive_max_static_results(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") + result = F.adaptive_max_pool1d(input, output_size=16) + + input_np = np.random.random([2, 3, 32]).astype("float32") + result_np = max_pool1D_forward_naive( + input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True) + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[result]) + self.assertTrue(np.allclose(fetches[0], result_np)) + + def test_adaptive_max_pool1d(self): + for place in self.places: + 
self.check_adaptive_max_dygraph_results(place) + self.check_adaptive_max_static_results(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py new file mode 100644 index 0000000000000000000000000000000000000000..d78788eb1e7c63be485210780db25e1de6fd84b4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py @@ -0,0 +1,274 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from __future__ import division + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def adaptive_pool2d_forward(x, output_size, data_format='NCHW', + pool_type="max"): + + N = x.shape[0] + C, H, W = [x.shape[1], x.shape[2], x.shape[3]] if data_format == 'NCHW' \ + else [x.shape[3], x.shape[1], x.shape[2]] + + if (isinstance(output_size, int) or output_size == None): + H_out = output_size + W_out = output_size + output_size = [H_out, W_out] + else: + H_out, W_out = output_size + + if output_size[0] == None: + output_size[0] = H + H_out = H + if output_size[1] == None: + output_size[1] = W + W_out = W + + out = np.zeros((N, C, H_out, W_out)) if data_format=='NCHW' \ + else np.zeros((N, H_out, W_out, C)) + + for i in range(H_out): + in_h_start = adaptive_start_index(i, H, output_size[0]) + in_h_end = adaptive_end_index(i, H, output_size[0]) + + for j in range(W_out): + in_w_start = adaptive_start_index(j, W, output_size[1]) + in_w_end = adaptive_end_index(j, W, output_size[1]) + + if data_format == 'NCHW': + x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end] + if pool_type == 'avg': + field_size = ( + (in_h_end - in_h_start) * (in_w_end - in_w_start)) + out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size + elif pool_type == 'max': + out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) + elif data_format == 'NHWC': + x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :] + if pool_type == 'avg': + field_size = ( + (in_h_end - in_h_start) * (in_w_end - in_w_start)) + out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size + elif pool_type == 'max': + out[:, i, j, :] = np.max(x_masked, axis=(1, 2)) + return out + + +class TestAdaptiveMaxPool2dAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random([2, 3, 7, 7]).astype("float32") + self.res_1_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[3, 3], pool_type="max") + + self.res_2_np = adaptive_pool2d_forward( + x=self.x_np, output_size=5, pool_type="max") + + self.res_3_np 
= adaptive_pool2d_forward( + x=self.x_np, output_size=[2, 5], pool_type="max") + """ + self.res_4_np = adaptive_pool2d_forward( + x=self.x_np, + output_size=[3, 3], + pool_type="max", + data_format="NHWC") + """ + self.res_5_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[None, 3], pool_type="max") + + def test_static_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.enable_static() + x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32") + + out_1 = paddle.nn.functional.adaptive_max_pool2d( + x=x, output_size=[3, 3]) + + out_2 = paddle.nn.functional.adaptive_max_pool2d(x=x, output_size=5) + + out_3 = paddle.nn.functional.adaptive_max_pool2d( + x=x, output_size=[2, 5]) + + #out_4 = paddle.nn.functional.adaptive_max_pool2d( + # x=x, output_size=[3, 3], data_format="NHWC") + + out_5 = paddle.nn.functional.adaptive_max_pool2d( + x=x, output_size=[None, 3]) + + exe = paddle.static.Executor(place=place) + [res_1, res_2, res_3, res_5] = exe.run( + fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=[out_1, out_2, out_3, out_5]) + + assert np.allclose(res_1, self.res_1_np) + + assert np.allclose(res_2, self.res_2_np) + + assert np.allclose(res_3, self.res_3_np) + + #assert np.allclose(res_4, self.res_4_np) + + assert np.allclose(res_5, self.res_5_np) + + def test_dynamic_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + x = paddle.to_variable(self.x_np) + + out_1 = paddle.nn.functional.adaptive_max_pool2d( + x=x, return_indices=False, output_size=[3, 3]) + + out_2 = paddle.nn.functional.adaptive_max_pool2d(x=x, output_size=5) + + out_3 = paddle.nn.functional.adaptive_max_pool2d( + x=x, output_size=[2, 5]) + + #out_4 = paddle.nn.functional.adaptive_max_pool2d( + # x=x, output_size=[3, 3], data_format="NHWC") + + out_5 = paddle.nn.functional.adaptive_max_pool2d( + x=x, output_size=[None, 3]) + + assert np.allclose(out_1.numpy(), self.res_1_np) + + assert np.allclose(out_2.numpy(), self.res_2_np) + + assert np.allclose(out_3.numpy(), self.res_3_np) + + #assert np.allclose(out_4.numpy(), self.res_4_np) + + assert np.allclose(out_5.numpy(), self.res_5_np) + + +class TestAdaptiveMaxPool2dClassAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random([2, 3, 7, 7]).astype("float32") + self.res_1_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[3, 3], pool_type="max") + + self.res_2_np = adaptive_pool2d_forward( + x=self.x_np, output_size=5, pool_type="max") + + self.res_3_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[2, 5], pool_type="max") + + #self.res_4_np = adaptive_pool2d_forward( + # x=self.x_np, + # output_size=[3, 3], + # pool_type="max", + # data_format="NHWC") + + self.res_5_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[None, 3], pool_type="max") + + def test_static_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.enable_static() + x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32") + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3]) + out_1 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=5) + out_2 = adaptive_max_pool(x=x) + + adaptive_max_pool = 
paddle.nn.AdaptiveMaxPool2d(output_size=[2, 5]) + out_3 = adaptive_max_pool(x=x) + + # adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d( + # output_size=[3, 3], data_format="NHWC") + # out_4 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d( + output_size=[None, 3]) + out_5 = adaptive_max_pool(x=x) + + exe = paddle.static.Executor(place=place) + [res_1, res_2, res_3, res_5] = exe.run( + fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=[out_1, out_2, out_3, out_5]) + + assert np.allclose(res_1, self.res_1_np) + + assert np.allclose(res_2, self.res_2_np) + + assert np.allclose(res_3, self.res_3_np) + + #assert np.allclose(res_4, self.res_4_np) + + assert np.allclose(res_5, self.res_5_np) + + def test_dynamic_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + x = paddle.to_variable(self.x_np) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3]) + out_1 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=5) + out_2 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[2, 5]) + out_3 = adaptive_max_pool(x=x) + + #adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d( + # output_size=[3, 3], data_format="NHWC") + #out_4 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d( + output_size=[None, 3]) + out_5 = adaptive_max_pool(x=x) + + assert np.allclose(out_1.numpy(), self.res_1_np) + + assert np.allclose(out_2.numpy(), self.res_2_np) + + assert np.allclose(out_3.numpy(), self.res_3_np) + + #assert np.allclose(out_4.numpy(), self.res_4_np) + + assert np.allclose(out_5.numpy(), self.res_5_np) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py new file mode 100755 index 0000000000000000000000000000000000000000..a7de0a5c6a7017617124b893313e0f9830cc09f9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py @@ -0,0 +1,293 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
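The adaptive_start_index/adaptive_end_index helpers repeated across these test files define the pooling windows; when the input length is divisible by the output size, the windows tile the input exactly. A standalone check of the window math:

import numpy as np

def adaptive_start_index(index, input_size, output_size):
    return int(np.floor(index * input_size / output_size))

def adaptive_end_index(index, input_size, output_size):
    return int(np.ceil((index + 1) * input_size / output_size))

# L=32 pooled to 16 outputs -> every window is exactly [2i, 2i + 2)
print([(adaptive_start_index(i, 32, 16), adaptive_end_index(i, 32, 16))
       for i in (0, 1, 15)])  # [(0, 2), (2, 4), (30, 32)]
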
+ +from __future__ import print_function +from __future__ import division + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def adaptive_pool3d_forward(x, + output_size, + adaptive=True, + data_format='NCDHW', + pool_type='max'): + + N = x.shape[0] + C, D, H, W = [x.shape[1], x.shape[2], x.shape[3], x.shape[4]] \ + if data_format == 'NCDHW' else [x.shape[4], x.shape[1], x.shape[2],x.shape[3]] + + if (isinstance(output_size, int) or output_size == None): + H_out = output_size + W_out = output_size + D_out = output_size + output_size = [D_out, H_out, W_out] + else: + D_out, H_out, W_out = output_size + + if output_size[0] == None: + output_size[0] = D + D_out = D + if output_size[1] == None: + output_size[1] = H + H_out = H + if output_size[2] == None: + output_size[2] = W + W_out = W + + out = np.zeros((N, C, D_out, H_out, W_out)) if data_format=='NCDHW' \ + else np.zeros((N, D_out, H_out, W_out, C)) + for k in range(D_out): + d_start = adaptive_start_index(k, D, output_size[0]) + d_end = adaptive_end_index(k, D, output_size[0]) + + for i in range(H_out): + h_start = adaptive_start_index(i, H, output_size[1]) + h_end = adaptive_end_index(i, H, output_size[1]) + + for j in range(W_out): + w_start = adaptive_start_index(j, W, output_size[2]) + w_end = adaptive_end_index(j, W, output_size[2]) + + if data_format == 'NCDHW': + x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start: + w_end] + if pool_type == 'avg': + field_size = (d_end - d_start) * (h_end - h_start) * ( + w_end - w_start) + out[:, :, k, i, j] = np.sum(x_masked, + axis=(2, 3, 4)) / field_size + elif pool_type == 'max': + out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) + + elif data_format == 'NDHWC': + x_masked = x[:, d_start:d_end, h_start:h_end, w_start: + w_end, :] + if pool_type == 'avg': + field_size = (d_end - d_start) * (h_end - h_start) * ( + w_end - w_start) + out[:, k, i, j, :] = np.sum(x_masked, + axis=(1, 2, 3)) / field_size + elif pool_type == 'max': + out[:, k, i, j, :] = np.max(x_masked, axis=(1, 2, 3)) + return out + + +class TestAdaptiveMaxPool3dAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32") + self.res_1_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[3, 3, 3], pool_type="max") + + self.res_2_np = adaptive_pool3d_forward( + x=self.x_np, output_size=5, pool_type="max") + + self.res_3_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[2, 3, 5], pool_type="max") + + self.res_4_np = adaptive_pool3d_forward( + x=self.x_np, + output_size=[3, 3, 3], + pool_type="max", + data_format="NDHWC") + + self.res_5_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[None, 3, None], pool_type="max") + + def test_static_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.enable_static() + x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") + + out_1 = paddle.nn.functional.adaptive_max_pool3d( + x=x, output_size=[3, 3, 3]) + + out_2 = paddle.nn.functional.adaptive_max_pool3d(x=x, output_size=5) + + out_3 = 
paddle.nn.functional.adaptive_max_pool3d( + x=x, output_size=[2, 3, 5]) + + #out_4 = paddle.nn.functional.adaptive_max_pool3d( + # x=x, output_size=[3, 3, 3], data_format="NDHWC") + + out_5 = paddle.nn.functional.adaptive_max_pool3d( + x=x, output_size=[None, 3, None]) + + exe = paddle.static.Executor(place=place) + [res_1, res_2, res_3, res_5] = exe.run( + fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=[out_1, out_2, out_3, out_5]) + + assert np.allclose(res_1, self.res_1_np) + + assert np.allclose(res_2, self.res_2_np) + + assert np.allclose(res_3, self.res_3_np) + + #assert np.allclose(res_4, self.res_4_np) + + assert np.allclose(res_5, self.res_5_np) + + def test_dynamic_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + x = paddle.to_variable(self.x_np) + + out_1 = paddle.nn.functional.adaptive_max_pool3d( + x=x, output_size=[3, 3, 3]) + + out_2 = paddle.nn.functional.adaptive_max_pool3d(x=x, output_size=5) + + out_3 = paddle.nn.functional.adaptive_max_pool3d( + x=x, output_size=[2, 3, 5]) + + #out_4 = paddle.nn.functional.adaptive_max_pool3d( + # x=x, output_size=[3, 3, 3], data_format="NDHWC") + + out_5 = paddle.nn.functional.adaptive_max_pool3d( + x=x, output_size=[None, 3, None]) + + assert np.allclose(out_1.numpy(), self.res_1_np) + + assert np.allclose(out_2.numpy(), self.res_2_np) + + assert np.allclose(out_3.numpy(), self.res_3_np) + + #assert np.allclose(out_4.numpy(), self.res_4_np) + + assert np.allclose(out_5.numpy(), self.res_5_np) + + +class TestAdaptiveMaxPool3dClassAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32") + self.res_1_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[3, 3, 3], pool_type="max") + + self.res_2_np = adaptive_pool3d_forward( + x=self.x_np, output_size=5, pool_type="max") + + self.res_3_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[2, 3, 5], pool_type="max") + + # self.res_4_np = adaptive_pool3d_forward( + # x=self.x_np, + # output_size=[3, 3, 3], + # pool_type="max", + # data_format="NDHWC") + + self.res_5_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[None, 3, None], pool_type="max") + + def test_static_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.enable_static() + x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + output_size=[3, 3, 3]) + out_1 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(output_size=5) + out_2 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + output_size=[2, 3, 5]) + out_3 = adaptive_max_pool(x=x) + + # adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + # output_size=[3, 3, 3], data_format="NDHWC") + # out_4 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + output_size=[None, 3, None]) + out_5 = adaptive_max_pool(x=x) + + exe = paddle.static.Executor(place=place) + [res_1, res_2, res_3, res_5] = exe.run( + fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=[out_1, out_2, out_3, out_5]) + + assert np.allclose(res_1, self.res_1_np) + + assert np.allclose(res_2, self.res_2_np) + + assert np.allclose(res_3, self.res_3_np) + + # assert np.allclose(res_4, self.res_4_np) + + assert 
np.allclose(res_5, self.res_5_np) + + def test_dynamic_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + x = paddle.to_variable(self.x_np) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + output_size=[3, 3, 3]) + out_1 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(output_size=5) + out_2 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + output_size=[2, 3, 5]) + out_3 = adaptive_max_pool(x=x) + + # adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + # output_size=[3, 3, 3], data_format="NDHWC") + # out_4 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + output_size=[None, 3, None]) + out_5 = adaptive_max_pool(x=x) + + assert np.allclose(out_1.numpy(), self.res_1_np) + + assert np.allclose(out_2.numpy(), self.res_2_np) + + assert np.allclose(out_3.numpy(), self.res_3_np) + + # assert np.allclose(out_4.numpy(), self.res_4_np) + + assert np.allclose(out_5.numpy(), self.res_5_np) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index c6d3c6e7d0492b2f4a98a595f015e3b9f4a19e24..5c705378e515eec4c950f6996e2789df603fcda3 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -85,10 +85,35 @@ class TestBatchNorm(unittest.TestCase): y = bn(fluid.dygraph.to_variable(x)) return y.numpy() + def compute_v3(x, is_test, trainable_statistics): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.0), + trainable=False), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.0), + trainable=False), + trainable_statistics=trainable_statistics) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v4(x): + with fluid.dygraph.guard(p): + bn = paddle.nn.BatchNorm2d( + shape[1], weight_attr=False, bias_attr=False) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + x = np.random.randn(*shape).astype("float32") y1 = compute_v1(x, False, False) y2 = compute_v2(x) + y3 = compute_v3(x, False, False) + y4 = compute_v4(x) self.assertTrue(np.allclose(y1, y2)) + self.assertTrue(np.allclose(y3, y4)) def test_static(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index 74c01e1424885051faf3e263e6ca26c1269a838e..2e1f9d41747e3a99b4b4a0650a52973459b85c7b 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -166,12 +166,16 @@ class TestClipAPI(unittest.TestCase): data_shape = [1, 9, 9, 4] data = np.random.random(data_shape).astype('float32') images = paddle.to_variable(data, dtype='float32') + v_min = paddle.to_variable(np.array([0.2], dtype=np.float32)) + v_max = paddle.to_variable(np.array([0.8], dtype=np.float32)) out_1 = paddle.clip(images, min=0.2, max=0.8) out_2 = paddle.clip(images, min=0.2, max=0.9) + out_3 = paddle.clip(images, min=v_min, max=v_max) self.assertTrue(np.allclose(out_1.numpy(), data.clip(0.2, 0.8))) self.assertTrue(np.allclose(out_2.numpy(), data.clip(0.2, 0.9))) + self.assertTrue(np.allclose(out_3.numpy(), 
data.clip(0.2, 0.8))) def test_errors(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py new file mode 100644 index 0000000000000000000000000000000000000000..4d744c8299f484fd60a081adb1b3b9eb2834ddef --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py @@ -0,0 +1,388 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +""" + high level unit test for distribute fleet. +""" + +import os +import sys +import subprocess + +import six +import shutil +import numpy as np +import argparse +from contextlib import closing +import socket +import time +import tempfile +import unittest + +import paddle +import paddle.fluid as fluid +import paddle.distributed.fleet.base.role_maker as role_maker +from paddle.distributed.fleet.base.util_factory import fleet_util +from paddle.distributed.fleet import fleet + +__all__ = ['FleetDistHeterRunnerBase', 'TestFleetHeterBase', 'runtime_main'] + +RUN_STEP = 5 +LEARNING_RATE = 0.01 +DIST_UT_PORT = 0 + + +class FleetDistHeterRunnerBase(object): + """ + run_pserver,run_trainer : after init role, using transpiler split program + net : implment by child class, the network of model + do training : exe run program + """ + + def build_role(self, args): + environs = {} + environs["PADDLE_PSERVERS_IP_PORT_LIST"] = args.endpoints + environs["PADDLE_TRAINER_ENDPOINTS"] = args.trainer_endpoints + environs[ + "PADDLE_HETER_TRAINER_IP_PORT_LIST"] = args.heter_trainer_endpoints + environs["PADDLE_HETER_TRAINER_DEVICE"] = args.heter_trainer_device + environs["TRAINING_ROLE"] = args.role.upper() + environs["PADDLE_TRAINERS_NUM"] = args.trainers + environs["PADDLE_TRAINER_ID"] = args.current_id + if args.role.upper() == "PSERVER": + environs["POD_IP"] = args.endpoints.split(",")[int( + args.current_id)].split(":")[0] + environs["PADDLE_PORT"] = args.endpoints.split(",")[int( + args.current_id)].split(":")[1] + elif args.role.upper() == "HETER_TRAINER": + environs["POD_IP"] = args.heter_trainer_endpoints.split(",")[int( + args.current_id)].split(":")[0] + environs["PADDLE_PORT"] = args.heter_trainer_endpoints.split(",")[ + int(args.current_id)].split(":")[1] + environs["FLAGS_selected_gpus"] = args.current_id + + for k, v in environs.items(): + os.environ[k] = str(v) + + self.role = role_maker.PaddleCloudRoleMaker() + return self.role + + def build_strategy(self, args): + self.strategy = paddle.distributed.fleet.DistributedStrategy() + self.strategy.a_sync = True + + return self.strategy + + def build_optimizer(self, avg_cost, strategy): + optimizer = fluid.optimizer.SGD(LEARNING_RATE) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + def run_pserver(self, args): + fleet.init_server() + fleet.run_server() + + def run_dataset_trainer(self, args): + out = 
self.do_dataset_training(fleet) + + def run_pyreader_trainer(self, args): + out = self.do_pyreader_training(fleet) + + def net(self, args, batch_size=4, lr=0.01): + raise NotImplementedError( + "get_model should be implemented by child classes.") + + def do_dataset_training(self, fleet): + raise NotImplementedError( + "do_dataset_training should be implemented by child classes.") + + def do_pyreader_training(self, fleet): + raise NotImplementedError( + "do_pyreader_training should be implemented by child classes.") + + +class TestFleetHeterBase(unittest.TestCase): + """ + start_pserver,start_trainer : add start cmd to test + run_cluster : using multi process to test distribute program + """ + + def _setup_config(self): + raise NotImplementedError("tests should have _setup_config implemented") + + def tearDown(self): + t = time.time() - self.startTime + print('%s: %.3f' % (self.__class__.__name__, t)) + + def setUp(self): + self.startTime = time.time() + + self._mode = "async" + self._reader = "pyreader" + self._trainers = 2 + self._pservers = 2 + self._port_set = set() + + self._heter_device = "gpu" + + global DIST_UT_PORT + if DIST_UT_PORT == 0 and os.getenv("PADDLE_DIST_UT_PORT"): + DIST_UT_PORT = int(os.getenv("PADDLE_DIST_UT_PORT")) + + if DIST_UT_PORT: + print("set begin_port:", DIST_UT_PORT) + self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + DIST_UT_PORT, DIST_UT_PORT + 1) + self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + DIST_UT_PORT + 2, DIST_UT_PORT + 3) + self._heter_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + DIST_UT_PORT + 4, DIST_UT_PORT + 5) + DIST_UT_PORT += 6 + else: + self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + self._find_free_port(), self._find_free_port()) + self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + self._find_free_port(), self._find_free_port()) + self._heter_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + self._find_free_port(), self._find_free_port()) + + self._python_interp = sys.executable + self._geo_sgd_need_push_nums = 5 + self._grad_clip_mode = 0 + self._setup_config() + + def _find_free_port(self): + def __free_port(): + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + while True: + port = __free_port() + if port not in self._port_set: + self._port_set.add(port) + return port + + def _start_pserver(self, cmd, required_envs): + ps0_cmd, ps1_cmd = cmd.format(0), cmd.format(1) + + ps0_pipe = open(tempfile.gettempdir() + "/ps0_err.log", "wb+") + ps1_pipe = open(tempfile.gettempdir() + "/ps1_err.log", "wb+") + + ps0_proc = subprocess.Popen( + ps0_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=ps0_pipe, + env=required_envs) + ps1_proc = subprocess.Popen( + ps1_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=ps1_pipe, + env=required_envs) + return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe + + def _start_trainer(self, cmd, required_envs): + tr0_cmd, tr1_cmd = cmd.format(0), cmd.format(1) + + tr0_pipe = open(tempfile.gettempdir() + "/tr0_err.log", "wb+") + tr1_pipe = open(tempfile.gettempdir() + "/tr1_err.log", "wb+") + + tr0_out = open(tempfile.gettempdir() + "/tr0_out.log", "wb+") + tr1_out = open(tempfile.gettempdir() + "/tr1_out.log", "wb+") + + tr0_proc = subprocess.Popen( + tr0_cmd.strip().split(" "), + stdout=tr0_out, + stderr=tr0_pipe, + env=required_envs) + tr1_proc = subprocess.Popen( + tr1_cmd.strip().split(" "), + stdout=tr1_out, + stderr=tr1_pipe, + env=required_envs) + + return tr0_proc, tr1_proc, tr0_pipe, tr1_pipe + + def 
_start_heter_trainer(self, cmd, required_envs): + heter0_cmd, heter1_cmd = cmd.format(0), cmd.format(1) + + heter0_pipe = open(tempfile.gettempdir() + "/heter0_err.log", "wb+") + heter1_pipe = open(tempfile.gettempdir() + "/heter1_err.log", "wb+") + heter0_out = open(tempfile.gettempdir() + "/heter0_out.log", "wb+") + heter1_out = open(tempfile.gettempdir() + "/heter1_out.log", "wb+") + + heter0_proc = subprocess.Popen( + heter0_cmd.strip().split(" "), + stdout=heter0_out, + stderr=heter0_pipe, + env=required_envs) + heter1_proc = subprocess.Popen( + heter1_cmd.strip().split(" "), + stdout=heter1_out, + stderr=heter1_pipe, + env=required_envs) + + return heter0_proc, heter1_proc, heter0_pipe, heter1_pipe + + def _run_cluster(self, model, envs): + env = {'GRAD_CLIP': str(self._grad_clip_mode)} + python_path = self._python_interp + gloo_path = tempfile.mkdtemp() + + if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': + envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '') + python_path += " -m coverage run --branch -p" + env.update(envs) + + tr_cmd = "{0} {1} --role trainer --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8} --heter_trainer_endpoints {9} --heter_trainer_device {10}".format( + python_path, model, self._ps_endpoints, self._tr_endpoints, + self._trainers, self._mode, self._geo_sgd_need_push_nums, + self._reader, gloo_path, self._heter_endpoints, self._heter_device) + + ps_cmd = "{0} {1} --role pserver --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8} --heter_trainer_endpoints {9} --heter_trainer_device {10}".format( + python_path, model, self._ps_endpoints, self._tr_endpoints, + self._trainers, self._mode, self._geo_sgd_need_push_nums, + self._reader, gloo_path, self._heter_endpoints, self._heter_device) + + heter_cmd = "{0} {1} --role heter_trainer --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8} --heter_trainer_endpoints {9} --heter_trainer_device {10}".format( + python_path, model, self._ps_endpoints, self._tr_endpoints, + self._trainers, self._mode, self._geo_sgd_need_push_nums, + self._reader, gloo_path, self._heter_endpoints, self._heter_device) + + # Run dist train to compare with local results + ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env) + tr0, tr1, tr0_pipe, tr1_pipe = self._start_trainer(tr_cmd, env) + heter0, heter1, heter0_pipe, heter1_pipe = self._start_heter_trainer( + heter_cmd, env) + + # Wait until trainer process terminate + while True: + stat0 = tr0.poll() + time.sleep(0.1) + if stat0 is not None: + break + + while True: + stat1 = tr1.poll() + time.sleep(0.1) + if stat1 is not None: + break + + tr0_out, tr0_err = tr0.communicate() + tr1_out, tr1_err = tr1.communicate() + print("tr end communicate") + + tr0_ret = tr0.returncode + tr1_ret = tr0.returncode + print("tr get returncode: {}".format(tr0_ret)) + if tr0_ret != 0: + print( + "========================Error tr0_err begin===========================" + ) + os.system("cat {}".format(tempfile.gettempdir() + "/tr0_err.log")) + print( + "========================Error tr0_err end===========================" + ) + + if tr1_ret != 0: + print( + "========================Error tr1_err begin===========================" + ) + os.system("cat {}".format(tempfile.gettempdir() + "/tr1_err.log")) + print( + "========================Error 
tr1_err end===========================" + ) + + self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check") + self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check") + + # close trainer file + tr0_pipe.close() + tr1_pipe.close() + ps0_pipe.close() + ps1_pipe.close() + heter0_pipe.close() + heter1_pipe.close() + + ps0.terminate() + ps1.terminate() + heter0.terminate() + heter1.terminate() + + shutil.rmtree(gloo_path) + return 0, 0 + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast + "http_proxy": "" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) + + +def runtime_main(test_class): + parser = argparse.ArgumentParser(description='Run Fleet test.') + parser.add_argument( + '--role', + type=str, + required=True, + choices=['pserver', 'trainer', 'heter_trainer']) + parser.add_argument('--endpoints', type=str, required=False, default="") + parser.add_argument( + '--trainer_endpoints', type=str, required=False, default="") + parser.add_argument( + '--heter_trainer_endpoints', type=str, required=False, default="") + parser.add_argument( + '--heter_trainer_device', type=str, required=False, default="gpu") + parser.add_argument('--gloo_path', type=str, required=False, default="") + parser.add_argument('--current_id', type=int, required=False, default=0) + parser.add_argument('--trainers', type=int, required=False, default=1) + parser.add_argument('--mode', type=str, required=False, default='async') + parser.add_argument( + '--geo_sgd_need_push_nums', type=int, required=False, default=2) + parser.add_argument('--reader', type=str, required=False, default='dataset') + args = parser.parse_args() + + model = test_class() + role = model.build_role(args) + fleet.init(role) + strategy = model.build_strategy(args) + avg_cost = model.net(args) + model.build_optimizer(avg_cost, strategy) + fleet_util._set_strategy(strategy) + fleet_util._set_role_maker(role) + + if args.role == "pserver" or args.role == "heter_trainer": + model.run_pserver(args) + else: + if args.reader == "dataset": + model.run_dataset_trainer(args) + else: + model.run_pyreader_trainer(args) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py new file mode 100644 index 0000000000000000000000000000000000000000..c3ffd50dc8da16f4a19c8da5383fe7f763aa7a72 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
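The model script that check_with_place hands to _run_cluster (dist_fleet_heter_ctr.py, which is not part of this excerpt) is expected to subclass FleetDistHeterRunnerBase and pass itself to runtime_main. A purely illustrative sketch of such a runner follows; the class name, feature shapes, and training bodies are invented here and are not the real CTR model:

    # illustrative runner script only, not the actual dist_fleet_heter_ctr.py
    import paddle.fluid as fluid
    from test_dist_fleet_heter_base import FleetDistHeterRunnerBase, runtime_main

    class HeterCtrRunnerSketch(FleetDistHeterRunnerBase):
        def net(self, args, batch_size=4, lr=0.01):
            # build a toy network; runtime_main expects net() to return the loss
            x = fluid.layers.data(name="x", shape=[13], dtype="float32")
            y = fluid.layers.data(name="y", shape=[1], dtype="float32")
            y_predict = fluid.layers.fc(input=x, size=1, act=None)
            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
            self.avg_cost = fluid.layers.mean(cost)
            return self.avg_cost

        def do_dataset_training(self, fleet):
            # run dataset-based training with the initialized fleet here
            pass

        def do_pyreader_training(self, fleet):
            # feed batches through a py_reader here
            pass

    if __name__ == "__main__":
        runtime_main(HeterCtrRunnerSketch)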
+ +from __future__ import print_function + +import os +import unittest +import tempfile +from test_dist_fleet_heter_base import TestFleetHeterBase + + +class TestDistHeterDatasetAsync2x2(TestFleetHeterBase): + def _setup_config(self): + self._mode = "async" + self._reader = "dataset" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast + "http_proxy": "", + "CPU_NUM": "1" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "4" + required_envs["GLOG_logtostderr"] = "1" + + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) + + def test_dist_train(self): + self.check_with_place( + "dist_fleet_heter_ctr.py", delta=1e-5, check_error_log=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py new file mode 100644 index 0000000000000000000000000000000000000000..3369039661205ef78a3ec0254241c3ed80b771a9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py @@ -0,0 +1,139 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
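With that in place, the async dataset case above can be driven directly through unittest; assuming the file sits in the usual python/paddle/fluid/tests/unittests directory, one way to select just this test programmatically is:

    import unittest

    # run only TestDistHeterDatasetAsync2x2.test_dist_train from the module above
    unittest.main(
        module="test_dist_fleet_heter_ctr",
        argv=["runner", "TestDistHeterDatasetAsync2x2.test_dist_train"],
        exit=False)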
+ +from __future__ import print_function +import unittest +import paddle +import os +import math +import paddle.fluid as fluid +import paddle.distributed.fleet.base.role_maker as role_maker +from paddle.distributed.fleet.base.util_factory import fleet_util +from paddle.distributed.fleet import fleet + + +class TestDistFleetHeterProgram(unittest.TestCase): + def build_role(self): + environs = {} + environs[ + "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36012,127.0.0.1:36013" + environs["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36014,127.0.0.1:36015" + environs[ + "PADDLE_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:36016,127.0.0.1:36017" + environs["PADDLE_HETER_TRAINER_DEVICE"] = "gpu" + environs["TRAINING_ROLE"] = "HETER_TRAINER" + environs["PADDLE_TRAINERS_NUM"] = 2 + environs["PADDLE_TRAINER_ID"] = 0 + environs["POD_IP"] = "127.0.0.1" + environs["PADDLE_PORT"] = "36016" + environs["FLAGS_selected_gpus"] = 0 + + for k, v in environs.items(): + os.environ[k] = str(v) + + self.role = role_maker.PaddleCloudRoleMaker() + return self.role + + def build_strategy(self): + self.strategy = paddle.distributed.fleet.DistributedStrategy() + self.strategy.a_sync = True + return self.strategy + + def build_input(self): + dense_input = fluid.layers.data( + name="dense_input", shape=[10], dtype="float32") + + sparse_input_ids = [ + fluid.layers.data( + name="C" + str(i), shape=[1], lod_level=1, dtype="int64") + for i in range(1, 27) + ] + + label = fluid.layers.data(name="label", shape=[1], dtype="float32") + + inputs = [dense_input] + sparse_input_ids + [label] + return inputs + + def build_net(self, inputs): + def embedding_layer(input): + return fluid.layers.embedding( + input=input, + is_sparse=True, + size=[100001, 10], + param_attr=fluid.ParamAttr( + name="SparseFeatFactors", + initializer=fluid.initializer.Uniform()), ) + + sparse_embed_seq = list(map(embedding_layer, inputs[1:-1])) + + concated = fluid.layers.concat(sparse_embed_seq + inputs[0:1], axis=1) + + with fluid.device_guard("gpu"): + fc1 = fluid.layers.fc( + input=concated, + size=400, + act="relu", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(concated.shape[1]))), + name="fc1") + + with fluid.device_guard("cpu"): + fc2 = fluid.layers.fc(input=fc1, + size=400, + act="relu", + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(fc1.shape[1]))), + name="fc2") + + with fluid.device_guard("gpu"): + fc3 = fluid.layers.fc(input=fc2, + size=400, + act="relu", + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(fc2.shape[1]))), + name="fc3") + + with fluid.device_guard("cpu"): + predict = fluid.layers.fc( + input=fc3, + size=2, + act="softmax", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(fc3.shape[1]))), ) + + with fluid.device_guard("gpu"): + labels = fluid.layers.cast(inputs[-1], dtype="int64") + cost = fluid.layers.cross_entropy(input=predict, label=labels) + avg_cost = fluid.layers.reduce_sum(cost) + + return avg_cost + + def build_optimizer(self, avg_cost, strategy): + optimizer = fluid.optimizer.SGD(1e-2) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + def test(self): + role = self.build_role() + fleet.init(role) + strategy = self.build_strategy() + inputs = self.build_input() + avg_cost = self.build_net(inputs) + self.build_optimizer(avg_cost, strategy) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/test_expand_as_op.py b/python/paddle/fluid/tests/unittests/test_expand_as_op.py index 69ed9f141437c307dc9e43fb501000d5cafeeaf7..150aff78508c61031a97bb56c9f14c4485cecea1 100755 --- a/python/paddle/fluid/tests/unittests/test_expand_as_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_as_op.py @@ -102,8 +102,23 @@ class TestExpandAsOpRank4(OpTest): self.check_grad(['X'], 'Out') +# Test dygraph API +class TestExpandAsDygraphAPI(unittest.TestCase): + def test_api(self): + import paddle + paddle.disable_static() + np_data_x = np.array([1, 2, 3]).astype('int32') + np_data_y = np.array([1, 2, 3, 1, 2, 3]).astype('int32') + data_x = paddle.to_tensor(np_data_x) + data_y = paddle.to_tensor(np_data_y) + out = fluid.layers.expand_as(data_x, data_y) + np_out = out.numpy() + assert np.array_equal(np_out, np.tile(np_data_x, (2))) + paddle.enable_static() + + # Test python API -class TestExpandAPI(unittest.TestCase): +class TestExpandAsAPI(unittest.TestCase): def test_api(self): input1 = np.random.random([12, 14]).astype("float32") input2 = np.random.random([48, 14]).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py index f5e888ab0eb3ca597bf62245ff9f3024fe81ee95..25801793f1f2e70c404727ed4f64c7d3c830aec9 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py @@ -43,7 +43,7 @@ class TestFleetBase(unittest.TestCase): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 9ab84404073906a8a95f9eb562cbe220e7c6b455..fc668ce3493e96e0790af522a439367fe10455f3 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -22,6 +22,7 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator from paddle.fluid.executor import Executor from op_test import OpTest +import paddle class TestGaussianRandomOp(OpTest): @@ -235,6 +236,56 @@ class TestGaussianRandomAPI(unittest.TestCase): self.assertAlmostEqual(np.mean(res_6), 0.0, delta=0.1) self.assertAlmostEqual(np.std(res_6), 1., delta=0.1) + def test_default_dtype(self): + paddle.disable_static() + + def test_default_fp_16(): + paddle.framework.set_default_dtype('float16') + paddle.tensor.random.gaussian_random([2, 3]) + + self.assertRaises(TypeError, test_default_fp_16) + + def test_default_fp_32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.gaussian_random([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) + + def test_default_fp_64(): + paddle.framework.set_default_dtype('float64') + out = paddle.tensor.random.gaussian_random([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + + test_default_fp_64() + test_default_fp_32() + + paddle.enable_static() + + +class TestStandardNormalDtype(unittest.TestCase): + def test_default_dtype(self): + paddle.disable_static() + + def test_default_fp_16(): + paddle.framework.set_default_dtype('float16') + 
paddle.tensor.random.standard_normal([2, 3]) + + self.assertRaises(TypeError, test_default_fp_16) + + def test_default_fp_32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.standard_normal([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) + + def test_default_fp_64(): + paddle.framework.set_default_dtype('float64') + out = paddle.tensor.random.standard_normal([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + + test_default_fp_64() + test_default_fp_32() + + paddle.enable_static() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py index f7756e54168cc90311c443e7a768d4befa2ceda3..619e9e8e90783365b5f0d718783a14468520c8d4 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -658,7 +658,7 @@ class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase): class TestImperativePipelineOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): optimizer = paddle.optimizer.SGD(learning_rate=0.5, - parameter_list=parameter_list) + parameters=parameter_list) optimizer = PipelineOptimizer(optimizer) return optimizer @@ -670,7 +670,7 @@ class TestImperativePipelineOptimizer(TestImperativeOptimizerBase): class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): optimizer = paddle.optimizer.SGD(learning_rate=0.5, - parameter_list=parameter_list) + parameters=parameter_list) optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5) return optimizer @@ -682,7 +682,7 @@ class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase): class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): optimizer = paddle.optimizer.SGD(learning_rate=0.5, - parameter_list=parameter_list) + parameters=parameter_list) optimizer = RecomputeOptimizer(optimizer) return optimizer diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index eb79a80da99fabd18ad9a8cbdd8f6aba08209d05..b76887f0965ca64b2b40bf9c0ce6e82b44fdad2f 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -299,7 +299,7 @@ class TestLayer(LayerTest): my_syncbn = paddle.nn.SyncBatchNorm(3) dy_ret = my_syncbn(base.to_variable(t)) dy_ret_value = dy_ret.numpy() - self.assertTrue(np.array_equal(static_ret, static_ret)) + self.assertTrue(np.array_equal(static_ret, dy_ret_value)) def test_relu(self): with self.static_graph(): diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index 77ec6f9b6bcda7568325698634fd4f86557cd1be..a535ef5e60397718e97100332b945b360838bbf4 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -19,6 +19,8 @@ import numpy as np import paddle.fluid.core as core from paddle.fluid.op import Operator from op_test import OpTest +import paddle +import paddle.fluid as fluid class TestMomentumOp1(OpTest): @@ -234,5 +236,48 @@ class TestSparseMomentumOp2(TestSparseMomentumOp): self.use_nesterov = True +class TestMomentumV2(unittest.TestCase): + def test_momentum_dygraph(self): + 
paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Momentum( + learning_rate=0.01, momentum=0.9, parameters=linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_momentum(self): + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + rms_optimizer = paddle.optimizer.Momentum( + learning_rate=0.1, momentum=0.9) + rms_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + def test_raise_error(self): + self.assertRaises( + ValueError, paddle.optimizer.Momentum, learning_rate=None) + self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py index b1a25ad3529e8b0a4126bc458838ecd876e5af30..1c05b96f1fc61234028e940f6403ae08a0186027 100644 --- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py @@ -174,66 +174,6 @@ class TestPool1d_API(unittest.TestCase): result = max_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) - def check_adaptive_max_dygraph_results(self, place): - with fluid.dygraph.guard(place): - input_np = np.random.random([2, 3, 32]).astype("float32") - input = fluid.dygraph.to_variable(input_np) - result = F.adaptive_max_pool1d(input, output_size=16) - - result_np = max_pool1D_forward_naive( - input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True) - self.assertTrue(np.allclose(result.numpy(), result_np)) - - ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1d( - output_size=16) - result = ada_max_pool1d_dg(input) - self.assertTrue(np.allclose(result.numpy(), result_np)) - - def check_adaptive_avg_dygraph_results(self, place): - with fluid.dygraph.guard(place): - input_np = np.random.random([2, 3, 32]).astype("float32") - input = fluid.dygraph.to_variable(input_np) - result = F.adaptive_avg_pool1d(input, output_size=16) - result_np = avg_pool1D_forward_naive( - input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True) - - self.assertTrue(np.allclose(result.numpy(), result_np)) - - ada_max_pool1d_dg = paddle.nn.layer.AdaptiveAvgPool1d( - output_size=16) - result = ada_max_pool1d_dg(input) - self.assertTrue(np.allclose(result.numpy(), result_np)) - - def check_adaptive_max_static_results(self, place): - with fluid.program_guard(fluid.Program(), fluid.Program()): - input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") - result = F.adaptive_max_pool1d(input, output_size=16) - - input_np = np.random.random([2, 3, 32]).astype("float32") - result_np = max_pool1D_forward_naive( - input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True) - - exe 
= fluid.Executor(place) - fetches = exe.run(fluid.default_main_program(), - feed={"input": input_np}, - fetch_list=[result]) - self.assertTrue(np.allclose(fetches[0], result_np)) - - def check_adaptive_avg_static_results(self, place): - with fluid.program_guard(fluid.Program(), fluid.Program()): - input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") - result = F.adaptive_avg_pool1d(input, output_size=16) - - input_np = np.random.random([2, 3, 32]).astype("float32") - result_np = avg_pool1D_forward_naive( - input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True) - - exe = fluid.Executor(place) - fetches = exe.run(fluid.default_main_program(), - feed={"input": input_np}, - fetch_list=[result]) - self.assertTrue(np.allclose(fetches[0], result_np)) - def check_max_dygraph_padding_same(self, place): with fluid.dygraph.guard(place): input_np = np.random.random([2, 3, 32]).astype("float32") @@ -265,10 +205,6 @@ class TestPool1d_API(unittest.TestCase): self.check_avg_dygraph_results(place) self.check_max_static_results(place) self.check_avg_static_results(place) - self.check_adaptive_max_dygraph_results(place) - self.check_adaptive_avg_dygraph_results(place) - self.check_adaptive_max_static_results(place) - self.check_adaptive_avg_static_results(place) self.check_max_dygraph_padding_same(place) self.check_avg_dygraph_padding_same(place) diff --git a/python/paddle/fluid/tests/unittests/test_rand_op.py b/python/paddle/fluid/tests/unittests/test_rand_op.py index c8e0130b77dc661d190f568ac501c9986a81f5e4..1eceeaadfec651ade5031ddc7e6a012244050e84 100644 --- a/python/paddle/fluid/tests/unittests/test_rand_op.py +++ b/python/paddle/fluid/tests/unittests/test_rand_op.py @@ -21,6 +21,7 @@ import paddle.fluid.core as core from paddle import rand import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard +import paddle class TestRandOpError(unittest.TestCase): @@ -115,5 +116,31 @@ class TestRandOpForDygraph(unittest.TestCase): self.run_net(True) +class TestRandDtype(unittest.TestCase): + def test_default_dtype(self): + paddle.disable_static() + + def test_default_fp_16(): + paddle.framework.set_default_dtype('float16') + paddle.tensor.random.rand([2, 3]) + + self.assertRaises(TypeError, test_default_fp_16) + + def test_default_fp_32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.rand([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) + + def test_default_fp_64(): + paddle.framework.set_default_dtype('float64') + out = paddle.tensor.random.rand([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + + test_default_fp_64() + test_default_fp_32() + + paddle.enable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py index fb3fc8735566fcf601a7cb507e3826dd92a5651e..2c87e06e893a4d6495ad81ac3dcdf375a41272fb 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py @@ -20,6 +20,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.op import Operator from op_test import OpTest +import paddle class TestSGDOp(OpTest): @@ -208,5 +209,46 @@ class TestSGDOpWithLargeInput(unittest.TestCase): result = exe.run(compiled_prog, fetch_list=[avg_cost]) +class TestSGDV2(unittest.TestCase): + def test_sgd_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + 
a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.SGD(learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_sgd(self): + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + rms_optimizer = paddle.optimizer.SGD(learning_rate=0.1) + rms_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + def test_raise_error(self): + self.assertRaises(ValueError, paddle.optimizer.SGD, learning_rate=None) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index b0701a9b187f6c7cf63f43d69f482ea13e6d3fe3..09cd40d9cc59914c82cc343bb78b72fbc2b29e59 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -221,5 +221,21 @@ class TestDygraphSyncBatchNormAPIError(unittest.TestCase): self.assertRaises(TypeError, my_sync_batch_norm, x2) +class TestConvertSyncBatchNorm(unittest.TestCase): + def test_convert(self): + if not core.is_compiled_with_cuda(): + return + + with program_guard(Program(), Program()): + model = paddle.nn.Sequential( + paddle.nn.Conv2d(3, 5, 3), paddle.nn.BatchNorm2d(5)) + sync_model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) + for idx, sublayer in enumerate(model.sublayers()): + if isinstance(sublayer, paddle.nn.BatchNorm2d): + self.assertEqual( + isinstance(sync_model[idx], paddle.nn.SyncBatchNorm), + True) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index a04aaaef0d41b9f991889586b489269b6ede5b42..56dc27a9a5b136829ce410b50998e23b77510665 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -536,5 +536,31 @@ class TestUniformDygraphMode(unittest.TestCase): self.assertTrue((x_np[i] > 0 and x_np[i] < 1.0)) +class TestUniformDtype(unittest.TestCase): + def test_default_dtype(self): + paddle.disable_static() + + def test_default_fp_16(): + paddle.framework.set_default_dtype('float16') + paddle.tensor.random.uniform([2, 3]) + + self.assertRaises(TypeError, test_default_fp_16) + + def test_default_fp_32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.uniform([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) + + def test_default_fp_64(): + paddle.framework.set_default_dtype('float64') + out = paddle.tensor.random.uniform([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + + test_default_fp_64() + test_default_fp_32() + + paddle.enable_static() + 
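These uniform dtype tests mirror the gaussian_random, standard_normal, and rand cases added earlier in this patch: float16 is rejected, while a float32 or float64 framework default propagates to the sampled tensor. A minimal dygraph sketch of the behaviour being pinned down, using only the calls exercised by the tests themselves:

    import paddle
    import paddle.fluid as fluid

    paddle.disable_static()
    try:
        paddle.framework.set_default_dtype('float16')
        paddle.tensor.random.uniform([2, 3])   # expected to raise
    except TypeError:
        pass  # float16 is not a supported sampling dtype

    paddle.framework.set_default_dtype('float64')
    out = paddle.tensor.random.uniform([2, 3])
    # the output follows the framework-wide default dtype
    assert out.dtype == fluid.core.VarDesc.VarType.FP64
    paddle.enable_static()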
+ if __name__ == "__main__": unittest.main() diff --git a/python/paddle/incubate/hapi/model.py b/python/paddle/incubate/hapi/model.py index e4a6b03f7aa5c7f537dc476d8f80162e530d5dbe..b52354d4ccf4671b0d372bae63a1befbe383e053 100644 --- a/python/paddle/incubate/hapi/model.py +++ b/python/paddle/incubate/hapi/model.py @@ -891,33 +891,31 @@ class Model(object): class Mnist(paddle.nn.Layer): def __init__(self): - super(MyNet, self).__init__() - self._fc = Linear(784, 1, act='softmax') + super(Mnist, self).__init__() + self._fc = Linear(784, 10, act='softmax') - @paddle.jit.to_static # If save for inference in dygraph, need this - def forward(self, x): - y = self._fc(x) - return y + # If save for inference in dygraph, need this + @paddle.jit.to_static + def forward(self, x): + y = self._fc(x) + return y - dynamic = True # False + dynamic = True # False device = hapi.set_device('cpu') # if use static graph, do not set paddle.disable_static(device) if dynamic else None - # inputs and labels are not required for dynamic graph. input = hapi.Input([None, 784], 'float32', 'x') label = hapi.Input([None, 1], 'int64', 'label') - model = hapi.Model(Mnist(), input, label) optim = paddle.optimizer.SGD(learning_rate=1e-3, - parameter_list=model.parameters()) - model.prepare(optim, - paddle.nn.CrossEntropyLoss(), - hapi.metrics.Accuracy()) + parameter_list=model.parameters()) + model.prepare(optim, paddle.nn.CrossEntropyLoss()) mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False) model.fit(mnist_data, epochs=1, batch_size=32, verbose=0) - model.save('checkpoint/test') # save for training - model.save('inference_model', False) # save for inference + model.save('checkpoint/test') # save for training + model.save('inference_model', False) # save for inference + """ if ParallelEnv().local_rank == 0: @@ -1534,47 +1532,6 @@ class Model(object): Returns: list: The fetch variables' name list - - Examples: - .. code-block:: python - import numpy as np - import paddle - from paddle.static import InputSpec - - import paddle.incubate.hapi as hapi - from paddle.nn import Linear - from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset - - class Mnist(Layer): - def __init__(self, classifier_act=None): - super(Mnist, self).__init__() - - self.fc = Linear(input_dim=784, output_dim=10, act="softmax") - - @paddle.jit.to_static # In static mode, you need to delete this. - def forward(self, inputs): - outputs = self.fc(inputs) - return outputs - - dynamic = True # False - device = hapi.set_device('gpu') - - # if use static graph, do not set - paddle.disable_static(device) if dynamic else None - - # inputs and labels are not required for dynamic graph. - input = InputSpec([None, 784], 'float32', 'x') - label = InputSpec([None, 1], 'int64', 'label') - - model = hapi.Model(Mnist(), input, label) - optim = paddle.optimizer.SGD(learning_rate=1e-3, - parameter_list=model.parameters()) - model.prepare(optim, - paddle.nn.CrossEntropyLoss(), - hapi.metrics.Accuracy()) - mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False) - model.fit(mnist_data, epochs=1, batch_size=32, verbose=0) - model.save_inference_model('inference_model') """ def get_inout_spec(all_vars, return_name=False): @@ -1592,65 +1549,66 @@ class Model(object): # the inputs of the model in running. # 3. Make it Unnecessary to add `@paddle.jit.to_static` for users in dynamic mode. if fluid.in_dygraph_mode(): - layer = self.network - fluid.disable_dygraph() - - # 1. 
input check - prog_translator = ProgramTranslator() - if not prog_translator.enable_declarative: - raise RuntimeError( - "save_inference_model doesn't work when setting ProgramTranslator.enable=False." - ) - if not isinstance(layer, Layer): - raise TypeError( - "The input layer should be 'Layer', but received layer type is %s." - % type(layer)) - - # 2. get program of declarative Layer.forward - concrete_program = layer.forward.concrete_program - - # NOTE: we maintain the mapping of variable name to - # structured name, the buffer variable (non-persistable) - # saved to inference program may not need by dygraph Layer, - # we only record the state_dict variable's structured name - state_names_dict = dict() - for structured_name, var in layer.state_dict().items(): - state_names_dict[var.name] = structured_name - - # 3. share parameters from Layer to scope & record var info - scope = core.Scope() - extra_var_info = dict() - for param_or_buffer in concrete_program.parameters: - # share to scope - param_or_buffer_tensor = scope.var( - param_or_buffer.name).get_tensor() - src_tensor = param_or_buffer.value().get_tensor() - param_or_buffer_tensor._share_data_with(src_tensor) - # record var info - extra_info_dict = dict() - if param_or_buffer.name in state_names_dict: - extra_info_dict['structured_name'] = state_names_dict[ - param_or_buffer.name] - extra_info_dict['stop_gradient'] = param_or_buffer.stop_gradient - if isinstance(param_or_buffer, ParamBase): - extra_info_dict['trainable'] = param_or_buffer.trainable - extra_var_info[param_or_buffer.name] = extra_info_dict - - # 4. build input & output spec - input_var_names = get_inout_spec(concrete_program.inputs, True) - output_vars = get_inout_spec(concrete_program.outputs) - - # 5. save inference model - with scope_guard(scope): - return fluid.io.save_inference_model( - dirname=save_dir, - feeded_var_names=input_var_names, - target_vars=output_vars, - executor=Executor(_current_expected_place()), - main_program=concrete_program.main_program.clone(), - model_filename=model_filename, - params_filename=params_filename, - program_only=model_only) + with fluid.framework._dygraph_guard(None): + layer = self.network + + # 1. input check + prog_translator = ProgramTranslator() + if not prog_translator.enable_declarative: + raise RuntimeError( + "save_inference_model doesn't work when setting ProgramTranslator.enable=False." + ) + if not isinstance(layer, Layer): + raise TypeError( + "The input layer should be 'Layer', but received layer type is %s." + % type(layer)) + + # 2. get program of declarative Layer.forward + concrete_program = layer.forward.concrete_program + + # NOTE: we maintain the mapping of variable name to + # structured name, the buffer variable (non-persistable) + # saved to inference program may not need by dygraph Layer, + # we only record the state_dict variable's structured name + state_names_dict = dict() + for structured_name, var in layer.state_dict().items(): + state_names_dict[var.name] = structured_name + + # 3. 
share parameters from Layer to scope & record var info + scope = core.Scope() + extra_var_info = dict() + for param_or_buffer in concrete_program.parameters: + # share to scope + param_or_buffer_tensor = scope.var( + param_or_buffer.name).get_tensor() + src_tensor = param_or_buffer.value().get_tensor() + param_or_buffer_tensor._share_data_with(src_tensor) + # record var info + extra_info_dict = dict() + if param_or_buffer.name in state_names_dict: + extra_info_dict['structured_name'] = state_names_dict[ + param_or_buffer.name] + extra_info_dict[ + 'stop_gradient'] = param_or_buffer.stop_gradient + if isinstance(param_or_buffer, ParamBase): + extra_info_dict['trainable'] = param_or_buffer.trainable + extra_var_info[param_or_buffer.name] = extra_info_dict + + # 4. build input & output spec + input_var_names = get_inout_spec(concrete_program.inputs, True) + output_vars = get_inout_spec(concrete_program.outputs) + + # 5. save inference model + with scope_guard(scope): + return fluid.io.save_inference_model( + dirname=save_dir, + feeded_var_names=input_var_names, + target_vars=output_vars, + executor=Executor(_current_expected_place()), + main_program=concrete_program.main_program.clone(), + model_filename=model_filename, + params_filename=params_filename, + program_only=model_only) else: prog = self._adapter._progs.get('test', None) diff --git a/python/paddle/incubate/hapi/tests/test_transforms.py b/python/paddle/incubate/hapi/tests/test_transforms.py index 087f2d1615fc916d23464c1c4387b8f6befe6ac8..84208fda1e947f343de52a0a3c8de68322672013 100644 --- a/python/paddle/incubate/hapi/tests/test_transforms.py +++ b/python/paddle/incubate/hapi/tests/test_transforms.py @@ -64,6 +64,11 @@ class TestTransforms(unittest.TestCase): self.do_transform(trans) + def test_normalize(self): + normalize = transforms.Normalize(mean=0.5, std=0.5) + trans = transforms.Compose([transforms.Permute(mode='CHW'), normalize]) + self.do_transform(trans) + def test_trans_resize(self): trans = transforms.Compose([ transforms.Resize(300, [0, 1]), @@ -165,7 +170,7 @@ class TestTransforms(unittest.TestCase): fake_img = np.random.rand(500, 400, 3).astype('float32') fake_img_gray = trans_gray(fake_img) - np.testing.assert_equal(len(fake_img_gray.shape), 2) + np.testing.assert_equal(len(fake_img_gray.shape), 3) np.testing.assert_equal(fake_img_gray.shape[0], 500) np.testing.assert_equal(fake_img_gray.shape[1], 400) diff --git a/python/paddle/incubate/hapi/vision/transforms/functional.py b/python/paddle/incubate/hapi/vision/transforms/functional.py index f76aa6be8b4ddaf8b57278b32cf11d145350d772..b118ee3fc7553dc7d02028ae273be33166829635 100644 --- a/python/paddle/incubate/hapi/vision/transforms/functional.py +++ b/python/paddle/incubate/hapi/vision/transforms/functional.py @@ -16,6 +16,7 @@ import sys import collections import random import math +import functools import cv2 import numbers @@ -31,6 +32,23 @@ else: __all__ = ['flip', 'resize', 'pad', 'rotate', 'to_grayscale'] +def keepdims(func): + """Keep the dimension of input images unchanged""" + + @functools.wraps(func) + def wrapper(image, *args, **kwargs): + if len(image.shape) != 3: + raise ValueError("Expect image have 3 dims, but got {} dims".format( + len(image.shape))) + ret = func(image, *args, **kwargs) + if len(ret.shape) == 2: + ret = ret[:, :, np.newaxis] + return ret + + return wrapper + + +@keepdims def flip(image, code): """ Accordding to the code (the type of flip), flip the input image @@ -62,6 +80,7 @@ def flip(image, code): return cv2.flip(image, flipCode=code) 
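The keepdims wrapper above is what the updated test_grayscale expectation relies on: a wrapped transform that returns an H x W image gets its channel axis restored, so the grayscale output becomes (500, 400, 1) instead of 2-dimensional. A small self-contained illustration, using a made-up single-channel transform (fake_to_gray) rather than the real cv2-based ones, and assuming keepdims is in scope from this module:

    import numpy as np

    @keepdims
    def fake_to_gray(image):
        # collapses H x W x C to H x W, like a grayscale conversion would
        return image.mean(axis=2)

    out = fake_to_gray(np.zeros((500, 400, 3), dtype='float32'))
    assert out.shape == (500, 400, 1)  # the dropped channel axis is restored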
+@keepdims def resize(img, size, interpolation=cv2.INTER_LINEAR): """ resize the input data to given size @@ -103,6 +122,7 @@ def resize(img, size, interpolation=cv2.INTER_LINEAR): return cv2.resize(img, size[::-1], interpolation=interpolation) +@keepdims def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'): """Pads the given CV Image on all sides with speficified padding mode and fill value. @@ -193,6 +213,7 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'): return img +@keepdims def rotate(img, angle, interpolation=cv2.INTER_LINEAR, @@ -266,6 +287,7 @@ def rotate(img, return dst.astype(dtype) +@keepdims def to_grayscale(img, num_output_channels=1): """Converts image to grayscale version of image. diff --git a/python/paddle/incubate/hapi/vision/transforms/transforms.py b/python/paddle/incubate/hapi/vision/transforms/transforms.py index 90c6e279959b2133e5cc1184b981723b34c0b750..d46faa0685aa13790be217e0c99ab407790dd2ca 100644 --- a/python/paddle/incubate/hapi/vision/transforms/transforms.py +++ b/python/paddle/incubate/hapi/vision/transforms/transforms.py @@ -505,7 +505,7 @@ class Normalize(object): mean = [mean, mean, mean] if isinstance(std, numbers.Number): - mean = [std, std, std] + std = [std, std, std] self.mean = np.array(mean, dtype=np.float32).reshape(len(mean), 1, 1) self.std = np.array(std, dtype=np.float32).reshape(len(std), 1, 1) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 76063458d44de3000ad7c1af08376c07e0209c27..5cc9f6d32f9d7ef3dafd73badd0ea88bed372968 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -97,8 +97,20 @@ from .layer.common import Dropout #DEFINE_ALIAS from .layer.common import Dropout2D #DEFINE_ALIAS from .layer.common import Dropout3D #DEFINE_ALIAS from .layer.common import AlphaDropout #DEFINE_ALIAS + +from .layer.pooling import AvgPool1d #DEFINE_ALIAS +from .layer.pooling import AvgPool2d #DEFINE_ALIAS +from .layer.pooling import AvgPool3d #DEFINE_ALIAS +from .layer.pooling import MaxPool1d #DEFINE_ALIAS +from .layer.pooling import MaxPool2d #DEFINE_ALIAS +from .layer.pooling import MaxPool3d #DEFINE_ALIAS +from .layer.pooling import AdaptiveAvgPool1d #DEFINE_ALIAS from .layer.pooling import AdaptiveAvgPool2d #DEFINE_ALIAS from .layer.pooling import AdaptiveAvgPool3d #DEFINE_ALIAS + +from .layer.pooling import AdaptiveMaxPool1d #DEFINE_ALIAS +from .layer.pooling import AdaptiveMaxPool2d #DEFINE_ALIAS +from .layer.pooling import AdaptiveMaxPool3d #DEFINE_ALIAS from .layer.conv import Conv1d #DEFINE_ALIAS from .layer.conv import Conv2d #DEFINE_ALIAS from .layer.conv import Conv3d #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 414e70853eb7163230ab2db987fc19c58e168f19..3c0aa9c5c99e545b657559c30fcde46a69781231 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -170,22 +170,28 @@ from .norm import layer_norm #DEFINE_ALIAS from .norm import lrn #DEFINE_ALIAS from .norm import normalize #DEFINE_ALIAS # from .norm import spectral_norm #DEFINE_ALIAS -from .pooling import max_pool1d #DEFINE_ALIAS -from .pooling import avg_pool1d #DEFINE_ALIAS -from .pooling import adaptive_max_pool1d #DEFINE_ALIAS -from .pooling import adaptive_avg_pool1d #DEFINE_ALIAS from .pooling import pool2d #DEFINE_ALIAS from .pooling import pool3d #DEFINE_ALIAS +from .pooling import avg_pool1d #DEFINE_ALIAS from .pooling import adaptive_pool2d #DEFINE_ALIAS from .pooling import 
adaptive_pool3d #DEFINE_ALIAS -from .rnn import rnn #DEFINE_ALIAS -from .rnn import birnn #DEFINE_ALIAS from .pooling import avg_pool2d #DEFINE_ALIAS -from .pooling import max_pool2d #DEFINE_ALIAS from .pooling import avg_pool3d #DEFINE_ALIAS +from .pooling import max_pool1d #DEFINE_ALIAS +from .pooling import max_pool2d #DEFINE_ALIAS from .pooling import max_pool3d #DEFINE_ALIAS + +from .pooling import adaptive_pool2d #DEFINE_ALIAS +from .pooling import adaptive_pool3d #DEFINE_ALIAS +from .pooling import adaptive_max_pool1d #DEFINE_ALIAS +from .pooling import adaptive_max_pool2d #DEFINE_ALIAS +from .pooling import adaptive_max_pool3d #DEFINE_ALIAS +from .pooling import adaptive_avg_pool1d #DEFINE_ALIAS from .pooling import adaptive_avg_pool2d #DEFINE_ALIAS from .pooling import adaptive_avg_pool3d #DEFINE_ALIAS + +from .rnn import rnn #DEFINE_ALIAS +from .rnn import birnn #DEFINE_ALIAS # from .rnn import gru_unit #DEFINE_ALIAS # from .rnn import lstm #DEFINE_ALIAS # from .rnn import lstm_unit #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index f80f200c7163836252faa4b1c932178f6bab0dff..42d7d98aefcbbf51f562b98c4c494aeccfe20cf2 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -158,7 +158,7 @@ def conv1d(x, bias (Tensor, optional): The bias with shape [M,]. Default: None. stride (int or tuple, optional): The stride size. If stride is a tuple, it must contain one integers, (stride_size). Default: 1. - padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. + padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms. 1. a string in ['valid', 'same']. 2. an int, which means the feature map is zero paded by size of `padding` on both sides. 3. a list[int] or tuple[int] whose length is 1, which means the feature map is zero paded by size of `padding[0]` on both sides. @@ -185,7 +185,7 @@ def conv1d(x, same with input. Raises: - ValueError: If the channel dimmention of the input is less than or equal to zero. + ValueError: If the channel dimension of the input is less than or equal to zero. ValueError: If `data_format` is not "NCL" or "NLC". ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 @@ -238,7 +238,7 @@ def conv1d(x, num_channels = x.shape[channel_dim] num_filters = weight.shape[0] if num_channels < 0: - raise ValueError("The channel dimmention of the input({}) " + raise ValueError("The channel dimension of the input({}) " "should be defined. Received: {}.".format( x.shape, num_channels)) if num_channels % groups != 0: @@ -260,7 +260,7 @@ def conv1d(x, padding = padding + [0] else: raise ValueError( - "The size of padding's dimmention should 1 or 2. But got padding={}". + "The size of padding's dimension should be 1 or 2. But got padding={}". format(padding)) stride = utils.convert_to_list(stride, 1, 'stride') + [1] @@ -350,7 +350,7 @@ def conv2d(x, For each input :math:`X`, the equation is: - .. math:: + .. math:: Out = \sigma (W \\ast X + b) @@ -377,7 +377,7 @@ def conv2d(x, Where - .. math:: + .. 
math:: H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 @@ -424,7 +424,7 @@ def conv2d(x, Raises: ValueError: If `data_format` is not "NCHW" or "NHWC". - ValueError: If the channel dimmention of the input is less than or equal to zero. + ValueError: If the channel dimension of the input is less than or equal to zero. ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 or the element corresponding to the input's channel is not 0. @@ -465,7 +465,7 @@ def conv2d(x, num_channels = x.shape[channel_dim] num_filters = weight.shape[0] if num_channels < 0: - raise ValueError("The channel dimmention of the input({}) " + raise ValueError("The channel dimension of the input({}) " "should be defined. Received: {}.".format( x.shape, num_channels)) if num_channels % groups != 0: @@ -710,7 +710,7 @@ def conv_transpose1d(x, num_channels = x.shape[channel_dim] if num_channels < 0: - raise ValueError("The channel dimmention of the input({}) " + raise ValueError("The channel dimension of the input({}) " "should be defined. Received: {}.".format( x.shape, num_channels)) if num_channels % groups != 0: @@ -728,7 +728,7 @@ def conv_transpose1d(x, padding = padding + [0] else: raise ValueError( - "The size of padding's dimmention should 1 or 2. But got padding={}". + "The size of padding's dimension should 1 or 2. But got padding={}". format(padding)) stride = utils.convert_to_list(stride, 1, 'stride') + [1] @@ -807,10 +807,10 @@ def conv_transpose2d(x, stride=1, padding=0, output_padding=0, - groups=1, dilation=1, - data_format='NCHW', + groups=1, output_size=None, + data_format='NCHW', name=None): """ @@ -829,7 +829,7 @@ def conv_transpose2d(x, For each input :math:`X`, the equation is: - .. math:: + .. math:: Out = \sigma (W \\ast X + b) @@ -856,7 +856,7 @@ def conv_transpose2d(x, Where - .. math:: + .. math:: H^\prime_{out} &= (H_{in} - 1) * strides[0] - pad_height_top - pad_height_bottom + dilations[0] * (H_f - 1) + 1 \\\\ W^\prime_{out} &= (W_{in} - 1) * strides[1] - pad_width_left - pad_width_right + dilations[1] * (W_f - 1) + 1 \\\\ @@ -883,28 +883,27 @@ def conv_transpose2d(x, stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. If stride is a tuple, it must contain two integers, (stride_height, stride_width). Otherwise, stride_height = stride_width = stride. Default: stride = 1. - padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds - `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a - string, either 'VALID' or 'SAME' supported, which is the padding algorithm. - If `padding` is a tuple or list, it could be in three forms: - `[pad_height, pad_width]` or - `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and - when `data_format` is `'NCHW'`, - `padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `'NHWC'`, `padding` can be in the form + padding(str|int|list|tuple, optional): The padding size. It means the number of zero-paddings + on both sides for each dimension. If `padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. 
If padding size is a tuple or list, + it could be in three forms: `[pad_height, pad_width]` or + `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, + and when `data_format` is `"NCHW"`, `pool_padding` can be in the form + `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + when `data_format` is `"NHWC"`, `pool_padding` can be in the form `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. output_padding(int|list|tuple, optional): Additional size added to one side of each dimension in the output shape. Default: 0. - dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. - If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). - Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1. groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by grouped convolution in Alex Krizhevsky's Deep CNN paper, in which when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups = 1. + dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. + If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). + Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1. output_size(int|tuple|list, optional): The output image size. If output size is a tuple, it must contain two integers, (image_height, image_width). None if use filter_size, padding, and stride to calculate output_size. @@ -950,7 +949,7 @@ def conv_transpose2d(x, paddle.disable_static() x_var = paddle.to_tensor(x) w_var = paddle.to_tensor(w) - y_var = F.conv2d_transpose(x_var, w_var) + y_var = F.conv_transpose2d(x_var, w_var) y_np = y_var.numpy() print(y_np.shape) @@ -966,7 +965,7 @@ def conv_transpose2d(x, channel_dim = -1 if channel_last else 1 num_channels = x.shape[channel_dim] if num_channels < 0: - raise ValueError("The channel dimmention of the input({}) " + raise ValueError("The channel dimension of the input({}) " "should be defined. Received: {}.".format( x.shape, num_channels)) if num_channels % groups != 0: @@ -1070,7 +1069,7 @@ def conv3d(x, For each input :math:`X`, the equation is: - .. math:: + .. math:: Out = \sigma (W \\ast X + b) @@ -1096,7 +1095,7 @@ def conv3d(x, Where - .. math:: + .. math:: D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ @@ -1147,7 +1146,7 @@ def conv3d(x, Raises: ValueError: If `data_format` is not "NCDHW" or "NDHWC". - ValueError: If the channel dimmention of the input is less than or equal to zero. + ValueError: If the channel dimension of the input is less than or equal to zero. ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 or the element corresponding to the input's channel is not 0. @@ -1160,20 +1159,18 @@ def conv3d(x, Examples: .. 
code-block:: python - from paddle import fluid - import paddle.nn.functional as F - import paddle.fluid.dygraph as dg import numpy as np + import paddle + import paddle.nn.functional as F x = np.random.randn(2, 3, 8, 8, 8).astype(np.float32) w = np.random.randn(6, 3, 3, 3, 3).astype(np.float32) - place = fluid.CPUPlace() - with dg.guard(place): - x_var = dg.to_variable(x) - w_var = dg.to_variable(w) - y_var = F.conv3d(x_var, w_var, act="relu") - y_np = y_var.numpy() + paddle.disable_static() + x_var = paddle.to_tensor(x) + w_var = paddle.to_tensor(w) + y_var = F.conv3d(x_var, w_var) + y_np = y_var.numpy() print(y_np.shape) # (2, 6, 6, 6, 6) @@ -1190,7 +1187,7 @@ def conv3d(x, num_filters = weight.shape[0] if num_channels < 0: raise ValueError( - "The channel dimmention of the input({}) should be defined. " + "The channel dimension of the input({}) should be defined. " "Received: {}.".format(x.shape, num_channels)) if num_channels % groups != 0: raise ValueError( @@ -1260,8 +1257,8 @@ def conv_transpose3d(x, output_padding=0, groups=1, dilation=1, - data_format='NCDHW', output_size=None, + data_format='NCDHW', name=None): """ The convolution3d transpose layer calculates the output based on the input, @@ -1279,7 +1276,7 @@ def conv_transpose3d(x, For each input :math:`X`, the equation is: - .. math:: + .. math:: Out = \sigma (W \\ast X + b) @@ -1306,7 +1303,7 @@ def conv_transpose3d(x, Where - .. math:: + .. math:: D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\ H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\ @@ -1338,37 +1335,37 @@ def conv_transpose3d(x, If stride is a tuple, it must contain three integers, (stride_depth, stride_height, stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. - padding(int|list|str|tuple, optional): The padding size. The padding argument effectively - adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string, - either 'VALID' or 'SAME' supported, which is the padding algorithm. If `padding` - is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or + padding (string|int|list|tuple, optional): The padding size. It means the number of zero-paddings + on both sides for each dimension. If `padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If padding size is a tuple or list, + it could be in three forms: `[pad_depth, pad_height, pad_width]` or `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `'NCDHW'`, `padding` can be in the form + and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `'NDHWC'`, `padding` can be in the form + when `data_format` is `"NDHWC"`, `pool_padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. output_padding(int|list|tuple, optional): Additional size added to one side of each dimension in the output shape. Default: 0. - dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. 
- If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, - dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. - Default: dilation = 1. groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by grouped convolution in Alex Krizhevsky's Deep CNN paper, in which when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1 - data_format (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. + dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. + If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, + dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. + Default: dilation = 1. output_size(int|list|tuple, optional): The output image size. If output size is a tuple, it must contain three integers, (image_depth, image_height, image_width). This parameter only works when filter_size is None. If output_size and filter_size are specified at the same time, They should follow the formula above. Default: None. Output_size and filter_size should not be None at the same time. + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. + The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -1425,7 +1422,7 @@ def conv_transpose3d(x, num_filters = weight.shape[1] if num_channels < 0: raise ValueError( - "The channel dimmention of the input({}) should be defined. " + "The channel dimension of the input({}) should be defined. 
" "Received: {}.".format(x.shape, num_channels)) if num_channels % groups != 0: raise ValueError( diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 55bb36d136405385a88b991576c2a9091437d456..f1509143f3c933db12fc4ab6afd1a00b291f38f4 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -784,30 +784,30 @@ def kl_div(input, label, reduction='mean', name=None): import numpy as np import paddle.nn.functional as F - paddle.enable_imperative() + paddle.disable_static() shape = (5, 20) input = np.random.uniform(-10, 10, shape).astype('float32') target = np.random.uniform(-10, 10, shape).astype('float32') # 'batchmean' reduction, loss shape will be [N] - pred_loss = F.kl_div(paddle.to_variable(input), - paddle.to_variable(target), reduction='batchmean') + pred_loss = F.kl_div(paddle.to_tensor(input), + paddle.to_tensor(target), reduction='batchmean') # shape=[5] # 'mean' reduction, loss shape will be [1] - pred_loss = F.kl_div(paddle.to_variable(input), - paddle.to_variable(target), reduction='mean') + pred_loss = F.kl_div(paddle.to_tensor(input), + paddle.to_tensor(target), reduction='mean') # shape=[1] # 'sum' reduction, loss shape will be [1] - pred_loss = F.kl_div(paddle.to_variable(input), - paddle.to_variable(target), reduction='sum') + pred_loss = F.kl_div(paddle.to_tensor(input), + paddle.to_tensor(target), reduction='sum') # shape=[1] # 'none' reduction, loss shape is same with input shape - pred_loss = F.kl_div(paddle.to_variable(input), - paddle.to_variable(target), reduction='none') + pred_loss = F.kl_div(paddle.to_tensor(input), + paddle.to_tensor(target), reduction='none') # shape=[5, 20] """ diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index ca657b8be3e67c7acb795a0f427ca5fe2c57b1f2..c8790a75901fd5d9a38862158246e3756dc575c4 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -18,124 +18,146 @@ from ...fluid.layers import pool3d #DEFINE_ALIAS from ...fluid.layers import adaptive_pool2d #DEFINE_ALIAS from ...fluid.layers import adaptive_pool3d #DEFINE_ALIAS from ...fluid import core -from ...fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_ -from ...fluid.layers import utils, LayerHelper -from ...fluid.data_feeder import check_type, check_variable_and_dtype, check_type, check_dtype, convert_dtype -from ...fluid.layers import unsqueeze, squeeze +from ...fluid.framework import in_dygraph_mode +from ...fluid.layers import utils, LayerHelper, unsqueeze, squeeze +from ...fluid.data_feeder import check_type, check_variable_and_dtype __all__ = [ 'pool2d', 'pool3d', + 'adaptive_pool2d', + 'adaptive_pool3d', 'avg_pool1d', + 'avg_pool2d', + 'avg_pool3d', 'max_pool1d', + 'max_pool2d', + 'max_pool3d', 'adaptive_avg_pool1d', - 'adaptive_max_pool1d', 'adaptive_avg_pool2d', 'adaptive_avg_pool3d', - 'adaptive_pool2d', - 'adaptive_pool3d', - 'max_pool2d', - 'avg_pool2d', - 'max_pool3d', - 'avg_pool3d', + 'adaptive_max_pool1d', + 'adaptive_max_pool2d', + 'adaptive_max_pool3d', ] -def check_input(x, dimension): +def _is_list_or_tuple(input): + return isinstance(input, (list, tuple)) + + +def _check_input(x, dimension): if len(x.shape) != dimension: - raise ValueError("Excepted Input X is 3-D tensor, but received {}-D {}". 
- format(len(x.shape), type(x))) + raise ValueError( + "Excepted Input X is {}-D tensor, but received {}-D {}".format( + dimension, len(x.shape), type(x))) -def check_instance(x, x_name, types=(int, float)): +def _check_instance(x, x_name, types=(int, float)): if not isinstance(x, types): raise ValueError("Excepted {} type for {} but received type: {}. ". format(types, x_name, type(x))) -def update_padding1d(padding, pool_type='avg'): - def is_list_or_tuple(ele): - if isinstance(ele, list) or isinstance(ele, tuple): - return True - return False - - if is_list_or_tuple(padding): - if padding.__len__() == 1 and not is_list_or_tuple(padding[0]): - return [0, padding[0]] - else: - raise ValueError( - "{}_pool1d() argument 'padding' should contain one int (got {})". - format(pool_type, padding.__len__())) +def _zero_padding_in_batch_and_channel(padding, channel_last): + if channel_last: + return list(padding[0]) == [0, 0] and list(padding[-1]) == [0, 0] else: - padding = [0, padding] + return list(padding[0]) == [0, 0] and list(padding[1]) == [0, 0] - return padding +def _exclude_padding_in_batch_and_channel(padding, channel_last): + padding_ = padding[1:-1] if channel_last else padding[2:] + padding_ = [elem for pad_a_dim in padding_ for elem in pad_a_dim] + return padding_ -def update_padding2d(padding, data_format): - def is_list_or_tuple(ele): - if isinstance(ele, list) or isinstance(ele, tuple): - return True - return False - - if is_list_or_tuple(padding) and len(padding) == 4: - if is_list_or_tuple(padding[0]) and (data_format == "NCHW"): - if not (padding[0] == [0, 0] and padding[1] == [0, 0]): - raise ValueError( - "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) - padding = padding[2:4] - padding = [ele for a_list in padding for ele in a_list] - elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"): - if not (padding[0] == [0, 0] and padding[3] == [0, 0]): - raise ValueError( - "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) - padding = padding[1:3] - padding = [ele for a_list in padding for ele in a_list] - padding = utils.convert_to_list(padding, 4, 'padding') - - if utils._is_symmetric_padding(padding, 2): - padding = [padding[0], padding[2]] - else: - padding = utils.convert_to_list(padding, 2, 'padding') - - return padding +def _channel_last(data_format, num_dims): + if num_dims == 1: + if data_format not in ['NCL', 'NLC']: + raise ValueError( + "Attr(data_format) should be 'NCL' or 'NLC'. Received " + "Attr(data_format): %s" % str(data_format)) + else: + return True if data_format == "NLC" else False + if num_dims == 2: + if data_format not in ['NCHW', 'NHWC']: + raise ValueError( + "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " + "Attr(data_format): %s" % str(data_format)) + else: + return True if data_format == "NHWC" else False + if num_dims == 3: + if data_format not in ['NCDHW', 'NDHWC']: + raise ValueError( + "Attr(data_format) should be 'NCDHW' or 'NDHWC'. 
Received " + "Attr(data_format): %s" % str(data_format)) + else: + return True if data_format == "NDHWC" else False -def update_padding3d(padding, data_format): - def is_list_or_tuple(ele): - if isinstance(ele, (list, tuple)): - return True - return False - if is_list_or_tuple(padding) and len(padding) == 5: - if is_list_or_tuple(padding[0]) and (data_format == "NCDHW"): - if not (padding[0] == [0, 0] and padding[1] == [0, 0]): +def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False): + if isinstance(padding, str): + padding = padding.upper() + if padding not in ["SAME", "VALID"]: + raise ValueError( + "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.". + format(padding)) + if padding == "VALID": + if ceil_mode != False: raise ValueError( - "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) - padding = padding[2:5] - padding = [ele for a_list in padding for ele in a_list] - elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): - if not (padding[0] == [0, 0] and padding[4] == [0, 0]): + "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. " + "Received ceil_mode: True.") + + padding_algorithm = "VALID" + padding = [0] * num_dims + else: + padding_algorithm = "SAME" + padding = [0] * num_dims + elif _is_list_or_tuple(padding): + # for padding like + # [(pad_before, pad_after), (pad_before, pad_after), ...] + # padding for batch_dim and channel_dim included + if len(padding) == 2 + num_dims and _is_list_or_tuple(padding[0]): + if not _zero_padding_in_batch_and_channel(padding, channel_last): raise ValueError( - "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) - padding = padding[1:4] - padding = [ele for a_list in padding for ele in a_list] - padding = utils.convert_to_list(padding, 6, 'padding') - if utils._is_symmetric_padding(padding, 3): - padding = [padding[0], padding[2], padding[4]] - - elif is_list_or_tuple(padding) and len(padding) == 6: - padding = utils.convert_to_list(padding, 6, 'padding') - if utils._is_symmetric_padding(padding, 3): - padding = [padding[0], padding[2], padding[4]] + "Non-zero padding({}) in the batch or channel dimensions " + "is not supported.".format(padding)) + padding_algorithm = "EXPLICIT" + padding = _exclude_padding_in_batch_and_channel(padding, + channel_last) + if utils._is_symmetric_padding(padding, num_dims): + padding = padding[0::2] + # for padding like [pad_before, pad_after, pad_before, pad_after, ...] + elif len(padding) == 2 * num_dims and isinstance(padding[0], int): + padding_algorithm = "EXPLICIT" + padding = utils.convert_to_list(padding, 2 * num_dims, 'padding') + if utils._is_symmetric_padding(padding, num_dims): + padding = padding[0::2] + # for padding like [pad_d1, pad_d2, ...] + elif len(padding) == num_dims and isinstance(padding[0], int): + padding_algorithm = "EXPLICIT" + padding = utils.convert_to_list(padding, num_dims, 'padding') + else: + raise ValueError("Invalid padding: {}".format(padding)) + # for integer padding else: - padding = utils.convert_to_list(padding, 3, 'padding') + padding_algorithm = "EXPLICIT" + padding = utils.convert_to_list(padding, num_dims, 'padding') + return padding, padding_algorithm + +def _expand_low_nd_padding(padding): + #1d to 2d fake input + if len(padding) == 2: + padding = [0] * 2 + padding + elif len(padding) == 1: + padding = [0] + padding + else: + raise ValueError( + "The size of padding's dimmention should be 1 or 2. 
But got padding={}". + format(padding)) return padding @@ -146,73 +168,57 @@ def avg_pool1d(x, count_include_pad=True, ceil_mode=False, name=None): - """ - - This operation applies a 1D average pooling over an input signal composed - of several input planes, based on the input, output_size, return_indices parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - The output tensor shape will be [N, C, output_size]. - - The output value of the layer with input size (N, C, L), - output (N, C, L_{out}) and kernel_size k can be precisely described as - For average pool1d: - - .. math:: - - Output(N_i, C_i, l) &= mean(Input[N_i, C_i, stride \times l:stride \times l+k]) - + """ + This API implements average pooling 1d operation, + See more details in :ref:`api_nn_pooling_AvgPool1d` . Args: x (Tensor): The input tensor of pooling operator which is a 3-D tensor with shape [N, C, L]. where `N` is batch size, `C` is the number of channels, - `L` is the length of the feature. The data type if float32 or float64. + `L` is the length of the feature. The data type is float32 or float64. kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one integers. + it must contain an integer. stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain one integers. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero, - then the input is implicitly zero-padded on both sides for padding number of points. + it must contain an integer. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides. + 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is `true`. + mode, default is `True`. ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width. - If it is set to False, the floor function will be used. Default False + If it is set to False, the floor function will be used. The default value is False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ValueError: If `padding` is a list or tuple but its length greater than 1. - ShapeError: If the input is not a 3-D. + ValueError: If `padding` is a list or tuple but its length is greater than 1. + ShapeError: If the input is not a 3-D tensor. 
ShapeError: If the output's shape calculated is not greater than 0. - Examples: - .. code-block:: python - import paddle import paddle.nn.functional as F paddle.disable_static() - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - pool_out = F.avg_pool1d(data, kernel_size=2, stride=2, padding=0) - # pool_out shape: [1, 3, 16] - + out = F.avg_pool1d(data, kernel_size=2, stride=2, padding=0) + # out shape: [1, 3, 16] """ """NCL to NCHW""" data_format = "NCHW" - check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'avg_pool1d') - check_input(x, 3) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool1d') + _check_input(x, 3) x = unsqueeze(x, [2]) - kernel_size = utils.convert_to_list(kernel_size, 1, 'pool_size') + kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size') kernel_size = [1] + kernel_size if stride is None: stride = kernel_size @@ -220,33 +226,20 @@ def avg_pool1d(x, stride = utils.convert_to_list(stride, 1, 'pool_stride') stride = [1] + stride - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'." - % str(padding)) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0] - if ceil_mode != False: - raise ValueError( - "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. " - "Received ceil_mode: True.") - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0] + channel_last = _channel_last("NCL", 1) + padding, padding_algorithm = _update_padding_nd( + padding, 1, channel_last=channel_last, ceil_mode=ceil_mode) - padding = update_padding1d(padding, "avg") + # use 2d to implenment 1d should expand padding in advance. + padding = _expand_low_nd_padding(padding) if in_dygraph_mode(): output = core.ops.pool2d( x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, 'paddings', padding, 'padding_algorithm', - padding_algorithm, 'use_cudnn', not count_include_pad, 'ceil_mode', - ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format', - data_format) + padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', not count_include_pad, + 'data_format', data_format) return squeeze(output, [2]) op_type = 'pool2d' @@ -275,126 +268,103 @@ def avg_pool1d(x, return squeeze(pool_out, [2]) -def max_pool1d(x, +def avg_pool2d(x, kernel_size, stride=None, padding=0, - return_indices=False, ceil_mode=False, + count_include_pad=True, + divisor_override=None, + data_format="NCHW", name=None): """ - - Applies a 1D max pooling over an input signal composed of several input planes based - on the input, output_size, return_indices parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - - The output value of the layer with input size (N, C, L), - output (N, C, L_{out}) and kernel_size k can be precisely described as - For average pool1d: - - .. math:: - - Output(N_i, C_i, l) &= max(Input[N_i, C_i, stride \times l:stride \times l+k])} - + This API implements average pooling 2d operation. + See more details in :ref:`api_nn_pooling_AvgPool2d` . + Args: - x (Tensor): The input tensor of pooling operator which is a 3-D tensor with - shape [N, C, L], where `N` is batch size, `C` is the number of channels, - `L` is the length of the feature. The data type if float32 or float64. 
- kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one integers. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain one integers. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be the following forms: `[pad_left, pad_right]`. - return_indices (bool): Whether return the max indices along with the outputs. default is `False`. - ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default. - If it is set to False, the floor function will be used. Default False. + x (Tensor): The input tensor of pooling operator which is a 4-D tensor with + shape [N, C, H, W]. The format of input tensor is `"NCHW"` or + `"NHWC"`, where `N` is batch size, `C` is the number of channels, + `H` is the height of the feature, and `W` is the width of the + feature. The data type if float32 or float64. + kernel_size (int|list|tuple): The pool kernel size. If it is a tuple or list, + it must contain two integers, (kernel_size_Height, kernel_size_Width). + Otherwise, the pool kernel size will be a square of an int. + stride (int|list|tuple): The stride size. If it is a tuple or list, + it must contain two integers, (stride_Height, stride_Width). + Otherwise, the stride size will be a square of an int. + + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is `true`. + divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`. + The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ValueError: If `padding` is a list or tuple but its length greater than 1. - ShapeError: If the input is not a 3-D. ShapeError: If the output's shape calculated is not greater than 0. - - Examples: - .. 
code-block:: python - import paddle import paddle.nn.functional as F + import numpy as np paddle.disable_static() - - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0) - # pool_out shape: [1, 3, 16] - - pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_indices=True) - # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] - + # avg pool2d + x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + out = F.avg_pool2d(x, + kernel_size=2, + stride=2, padding=0) + # out.shape [1, 3, 16, 16] """ - """NCL to NCHW""" - data_format = "NCHW" - check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'max_pool1d') - check_input(x, 3) - x = unsqueeze(x, [2]) - kernel_size = [1] + utils.convert_to_list(kernel_size, 1, 'pool_size') + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool2d') + kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size') if stride is None: stride = kernel_size else: - stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride') - - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'." - % str(padding)) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0] - if ceil_mode != False: - raise ValueError( - "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. " - "Received ceil_mode: True.") - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0] + stride = utils.convert_to_list(stride, 2, 'pool_stride') - padding = update_padding1d(padding, 'max') + channel_last = _channel_last(data_format, 2) + padding, padding_algorithm = _update_padding_nd( + padding, 2, channel_last, ceil_mode=ceil_mode) if in_dygraph_mode(): - pool_out = core.ops.max_pool2d_with_index( - x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, - 'paddings', padding, 'padding_algorithm', padding_algorithm, - 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, - 'exclusive', True, 'data_format', data_format) - return (squeeze(pool_out[0], [2]), squeeze( - pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2]) + output = core.ops.pool2d( + x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', + False, 'padding_algorithm', padding_algorithm, 'strides', stride, + 'paddings', padding, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', not count_include_pad, + 'data_format', data_format) + if divisor_override is None: + return output + else: + _check_instance(divisor_override, "divisor_override") + return output * (kernel_size[0] * kernel_size[1]) / divisor_override - op_type = 'max_pool2d_with_index' + op_type = 'pool2d' helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) - mask = helper.create_variable_for_type_inference(dtype) - outputs = {"Out": pool_out, "Mask": mask} helper.append_op( type=op_type, inputs={"X": x}, - outputs=outputs, + outputs={"Out": pool_out}, attrs={ - "pooling_type": 'max', + "pooling_type": "avg", "ksize": kernel_size, "global_pooling": False, "strides": stride, @@ -403,335 +373,211 @@ def max_pool1d(x, "use_cudnn": True, "ceil_mode": ceil_mode, "use_mkldnn": False, - "exclusive": True, + "exclusive": not count_include_pad, "data_format": data_format, 
}) - return (squeeze(pool_out, [2]), - squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2]) - - -def adaptive_avg_pool1d(x, output_size, name=None): - """ - - This operation applies a 1D adaptive average pooling over an input signal composed - of several input planes, based on the input, output_size, return_indices parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - The output tensor shape will be [N, C, output_size]. - - For average adaptive pool1d: - - .. math:: - - lstart &= floor(i * L_{in} / L_{out}) - - lend &= ceil((i + 1) * L_{in} / L_{out}) - - Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)} - - Args: - x (Tensor): The input tensor of pooling operator, which is a 3-D tensor - with shape [N, C, L]. The format of input tensor is NCL, - where N is batch size, C is the number of channels, L is the - length of the feature. The data type is float32 or float64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Tensor: The output tensor of adaptive average pooling result. The data type is same - as input tensor. - - Raises: - ValueError: 'output_size' should be a integer or list or tuple with length as 1. - - Examples: - .. code-block:: python - - # average adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) - # - import paddle - import paddle.nn.functional as F - paddle.disable_static() - - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - pool_out = F.adaptive_average_pool1d(data, output_size=16) - # pool_out shape: [1, 3, 16]) - """ - pool_type = 'avg' - check_variable_and_dtype(x, 'input', ['float32', 'float64'], - 'adaptive_pool2d') - check_input(x, 3) - check_type(output_size, 'pool_size', (int), 'adaptive_pool1d') - - pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') - - l_type = "pool2d" - x = unsqueeze(x, [2]) - if in_dygraph_mode(): - pool_out = core.ops.pool2d(x, 'pooling_type', pool_type, 'ksize', - pool_size, 'adaptive', True) - return squeeze(pool_out, [2]) - - helper = LayerHelper(l_type, **locals()) - dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - - outputs = {"Out": pool_out} - helper.append_op( - type=l_type, - inputs={"X": x}, - outputs=outputs, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "adaptive": True, - }) - - return squeeze(pool_out, [2]) + if divisor_override is None: + return pool_out + else: + _check_instance(divisor_override, "divisor_override") + return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override -def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): +def avg_pool3d(x, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + count_include_pad=False, + divisor_override=None, + data_format="NCDHW", + name=None): """ - This operation applies a 1D adaptive 
max pooling over an input signal composed - of several input planes, based on the input, output_size, return_indices parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - The output tensor shape will be [N, C, output_size]. - - For max adaptive pool1d: - - .. math:: - - lstart &= floor(i * L_{in} / L_{out}) - - lend &= ceil((i + 1) * L_{in} / L_{out}) - - Output(i) &= max(Input[lstart:lend])} + This API implements average pooling 3d operation. + See more details in :ref:`api_nn_pooling_AvgPool3d` . Args: - x (Tensor): The input tensor of pooling operator, which is a 3-D tensor - with shape [N, C, L]. The format of input tensor is NCL, - where N is batch size, C is the number of channels, L is the - length of the feature. The data type is float32 or float64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. - return_indices (bool): If true, the index of max pooling point will be returned along - with outputs. It cannot be set in average pooling type. Default False. + x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with + shape [N, C, D, H, W], where `N` represents the batch size, `C` represents + the number of channels, `D`, `H` and `W` represent the depth, height and width of the feature respectively. + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size + is a tuple or list, it must contain three integers, + (kernel_size_Depth, kernel_size_Height, kernel_size_Width). + Otherwise, the pool kernel size will be the cube of an int. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain three integers, [stride_Depth, stride_Height, stride_Width). + Otherwise, the pool stride size will be a cube of an int. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + ceil_mode (bool): ${ceil_mode_comment} + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is True. + divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. + data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: - Tensor: The output tensor of adaptive pooling result. 
The data type is same - as input tensor. - + Tensor: The output tensor of pooling result. The data type is same as input tensor. Raises: - ValueError: 'output_size' should be a integer or list or tuple with length as 1. - + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the output's shape calculated is not greater than 0. Examples: .. code-block:: python - - # max adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = max(input[:, :, lstart: lend]) - # - import paddle - import paddle.nn.functional as F - paddle.disable_static() - - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - pool_out = F.adaptive_max_pool1d(data, output_size=16) - # pool_out shape: [1, 3, 16]) - - pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_indices=True) - # pool_out shape: [1, 3, 16] indices shape: [1, 3, 16] - + import paddle.fluid as fluid + import paddle + x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) + # avg pool3d + out = paddle.nn.functional.avg_pool3d( + x, + kernel_size = 2, + stride = 2, + padding=0) + # out.shape: [1, 3, 16, 16, 16] """ - pool_type = 'max' - check_variable_and_dtype(x, 'input', ['float32', 'float64'], - 'adaptive_max_pool1d') - check_input(x, 3) - check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d') - check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d') - - pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d') + kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') + if stride is None: + stride = kernel_size + else: + stride = utils.convert_to_list(stride, 3, 'pool_stride') - l_type = 'max_pool2d_with_index' + channel_last = _channel_last(data_format, 3) + padding, padding_algorithm = _update_padding_nd( + padding, 3, channel_last=channel_last, ceil_mode=ceil_mode) - x = unsqueeze(x, [2]) if in_dygraph_mode(): - pool_out = core.ops.max_pool2d_with_index( - x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True) - return (squeeze(pool_out[0], [2]), squeeze( - pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2]) + output = core.ops.pool3d( + x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride, + 'paddings', padding, 'global_pooling', False, 'padding_algorithm', + padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', not count_include_pad, + 'data_format', data_format) + if divisor_override is None: + return output + else: + _check_instance(divisor_override, "divisor_override") + return output * (kernel_size[0] * kernel_size[1] * + kernel_size[2]) / divisor_override - helper = LayerHelper(l_type, **locals()) + op_type = "pool3d" + helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - - mask = helper.create_variable_for_type_inference(dtype) - outputs = {"Out": pool_out, "Mask": mask} + pool_out = helper.create_variable_for_type_inference(dtype) + 
outputs = {"Out": pool_out} helper.append_op( - type=l_type, + type=op_type, inputs={"X": x}, outputs=outputs, attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "adaptive": True, + "pooling_type": 'avg', + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": not count_include_pad, + "data_format": data_format, }) - return (squeeze(pool_out, [2]), - squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2]) + if divisor_override is None: + return pool_out + else: + _check_instance(divisor_override, "divisor_override") + return pool_out * (kernel_size[0] * kernel_size[1] * + kernel_size[2]) / divisor_override -def max_pool2d(x, +def max_pool1d(x, kernel_size, stride=None, padding=0, return_indices=False, ceil_mode=False, - data_format="NCHW", name=None): """ - This operation applies 2D max pooling over input feature based on the input, - and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCHW format, where N is batch size, C is the number of channels, - H is the height of the feature, and W is the width of the feature. - - Example: - Input: - X shape: $(N, C, H_{in}, W_{in})$ - Attr: - kernel_size: ksize - stride: stride - - Output: - Out shape: $(N, C, H_{out}, W_{out})$ - $$ - out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\ - & \text{input}(N_i, C_j, \text{stride[0]} \times h + m, - \text{stride[1]} \times w + n) - $$ + This API implements max pooling 1d opereation. + See more details in :ref:`api_nn_pooling_MaxPool1d` . Args: - x (Tensor): The input tensor of pooling operator which is a 4-D tensor with - shape [N, C, H, W]. The format of input tensor is `"NCHW"` or - `"NHWC"`, where `N` is batch size, `C` is the number of channels, - `H` is the height of the feature, and `W` is the width of the - feature. The data type if float32 or float64. + x (Tensor): The input tensor of pooling operator which is a 3-D tensor with + shape [N, C, L], where `N` is batch size, `C` is the number of channels, + `L` is the length of the feature. The data type if float32 or float64. kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain two integers, (pool_size_Height, pool_size_Width). - Otherwise, the pool kernel size will be a square of an int. + it must contain an integer. stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain two integers, (pool_stride_Height, pool_stride_Width). - Otherwise, the pool stride size will be a square of an int. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be in three forms: `[pad_height, pad_width]` or - `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, - `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - Otherwise, the pool padding size will be a square of an int. 
- ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - return_indices (bool): Whether to return the max indices along with the outputs. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. + it must contain an integer. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An integer, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides. + 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + return_indices (bool): Whether return the max indices along with the outputs. default is `False`. + ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default. + If it is set to False, the floor function will be used. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. + Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the input is not a 3-D tensor. ShapeError: If the output's shape calculated is not greater than 0. + Examples: .. code-block:: python import paddle import paddle.nn.functional as F - import numpy as np paddle.disable_static() - - # max pool2d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - output = F.max_pool2d(input, - kernel_size=2, - stride=2, padding=0) - # output.shape [1, 3, 16, 16] - - # for return_indices=True - output, max_indices = F.max_pool2d(input, - kernel_size=2, - stride=2, - padding=0, - return_indices=True) - # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0) + # pool_out shape: [1, 3, 16] + pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_indices=True) + # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] """ - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool2d') - kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size') + """NCL to NCHW""" + data_format = "NCHW" + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool1d') + _check_input(x, 3) + x = unsqueeze(x, [2]) + kernel_size = [1] + utils.convert_to_list(kernel_size, 1, 'pool_size') if stride is None: stride = kernel_size else: - stride = utils.convert_to_list(stride, 2, 'pool_stride') + stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride') - if data_format not in ["NCHW", "NHWC"]: - raise ValueError( - "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " - "Attr(data_format): %s." 
% str(data_format)) - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'." - % str(padding)) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0, 0] - if ceil_mode != False: - raise ValueError( - "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. " - "Received ceil_mode: True.") - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0, 0] + padding, padding_algorithm = _update_padding_nd( + padding, 1, ceil_mode=ceil_mode) - padding = update_padding2d(padding, data_format) + # use 2d to implenment 1d should expand padding in advance. + padding = _expand_low_nd_padding(padding) if in_dygraph_mode(): - output = core.ops.max_pool2d_with_index( + pool_out = core.ops.max_pool2d_with_index( x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, 'paddings', padding, 'padding_algorithm', padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format', data_format) - return output if return_indices else output[0] + return (squeeze(pool_out[0], [2]), squeeze( + pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2]) op_type = 'max_pool2d_with_index' helper = LayerHelper(op_type, **locals()) @@ -758,36 +604,21 @@ def max_pool2d(x, "data_format": data_format, }) - return (pool_out, mask) if return_indices else pool_out + return (squeeze(pool_out, [2]), + squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2]) -def avg_pool2d(x, +def max_pool2d(x, kernel_size, stride=None, padding=0, + return_indices=False, ceil_mode=False, - count_include_pad=True, - divisor_override=None, data_format="NCHW", name=None): """ - This operation applies 2D average pooling over input features based on the input, - and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCHW format, where N is batch size, C is the number of channels, - H is the height of the feature, and W is the width of the feature. - - Example: - Input: - X shape: $(N, C, H_{in}, W_{in})$ - Attr: - kernel_size: ksize - - Output: - Out shape: $(N, C, H_{out}, W_{out})$ - $$ - out(N_i, C_j, h, w) = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1} - input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) - $$ + This API implements max pooling 2d operation. + See more details in :ref:`api_nn_pooling_MaxPool2d` . Args: x (Tensor): The input tensor of pooling operator which is a 4-D tensor with @@ -796,30 +627,26 @@ def avg_pool2d(x, `H` is the height of the feature, and `W` is the width of the feature. The data type if float32 or float64. kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain two integers, (pool_size_Height, pool_size_Width). + it must contain two integers, (kernel_size_Height, kernel_size_Width). Otherwise, the pool kernel size will be a square of an int. stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain two integers, (pool_stride_Height, pool_stride_Width). + it must contain two integers, (stride_Height, stride_Width). Otherwise, the pool stride size will be a square of an int. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. 
If pool padding size is a tuple or list, - it could be in three forms: `[pad_height, pad_width]` or - `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, - `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - Otherwise, the pool padding size will be a square of an int. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is `true`. - divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. + return_indices (bool): Whether to return the max indices along with the outputs. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. Raises: @@ -832,87 +659,71 @@ def avg_pool2d(x, import paddle.nn.functional as F import numpy as np paddle.disable_static() - - # avg pool2d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - output = F.avg_pool2d(input, + # max pool2d + x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + out = F.max_pool2d(x, kernel_size=2, stride=2, padding=0) # output.shape [1, 3, 16, 16] - + # for return_indices=True + out, max_indices = F.max_pool2d(x, + kernel_size=2, + stride=2, + padding=0, + return_indices=True) + # out.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], """ - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool2d') + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool2d') kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size') if stride is None: stride = kernel_size else: stride = utils.convert_to_list(stride, 2, 'pool_stride') - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." 
- % str(padding)) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0, 0] - if ceil_mode != False: - raise ValueError( - "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. " - "Received ceil_mode: True.") - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0, 0] - if data_format not in ["NCHW", "NHWC"]: raise ValueError( "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " "Attr(data_format): %s." % str(data_format)) - pool_padding = update_padding2d(padding, data_format) + + channel_last = True if data_format == "NHWC" else False + + padding, padding_algorithm = _update_padding_nd( + padding, num_dims=2, channel_last=channel_last, ceil_mode=ceil_mode) if in_dygraph_mode(): - output = core.ops.pool2d( - x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', - False, 'padding_algorithm', padding_algorithm, 'strides', stride, - 'paddings', pool_padding, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', not count_include_pad, - 'data_format', data_format) - if divisor_override is None: - return output - else: - check_instance(divisor_override, "divisor_override") - return output * (kernel_size[0] * kernel_size[1]) / divisor_override + output = core.ops.max_pool2d_with_index( + x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, + 'paddings', padding, 'padding_algorithm', padding_algorithm, + 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, + 'exclusive', True, 'data_format', data_format) + return output if return_indices else output[0] - op_type = 'pool2d' + op_type = 'max_pool2d_with_index' helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) + mask = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out, "Mask": mask} helper.append_op( type=op_type, inputs={"X": x}, - outputs={"Out": pool_out}, + outputs=outputs, attrs={ - "pooling_type": "avg", + "pooling_type": 'max', "ksize": kernel_size, "global_pooling": False, "strides": stride, - "paddings": pool_padding, + "paddings": padding, "padding_algorithm": padding_algorithm, "use_cudnn": True, "ceil_mode": ceil_mode, "use_mkldnn": False, - "exclusive": not count_include_pad, + "exclusive": True, "data_format": data_format, }) - if divisor_override is None: - return pool_out - else: - check_instance(divisor_override, "divisor_override") - return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override + return (pool_out, mask) if return_indices else pool_out def max_pool3d(x, @@ -924,47 +735,25 @@ def max_pool3d(x, data_format="NCDHW", name=None): """ - This operation applies 3D max pooling over input features based on the input, - and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCDHW format, where N is batch size, C is the number of channels, - H is the height of the feature, D is the depth of the feature, and W is the width of the feature. - - Example: - Input: - X shape: $(N, C, D_{in}, H_{in}, W_{in})$ - Attr: - kernel_size: ksize - - Output: - Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ - $$ - \text{out}(N_i, C_j, d, h, w) ={} & \max_{k=0, \ldots, ksize[0]-1} \max_{m=0, \ldots, ksize[1]-1} \max_{n=0, \ldots, ksize[2]-1} \\ - & \text{input}(N_i, C_j, \text{stride[0]} \times d + k, - \text{stride[1]} \times h + m, \text{stride[2]} \times w + n) - $$ - + This API implements max pooling 3d operation. + See more details in :ref:`api_nn_pooling_MaxPool3d` .
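The deleted branches above show what each pooling entry point used to do with string paddings; the new `_update_padding_nd` calls centralize that behavior. A rough standalone sketch of the string-handling part only, offered as a hypothetical stand-in rather than the real helper (which also expands int/list/tuple paddings and channel-last layouts):

    def _normalize_padding_sketch(padding, num_dims, ceil_mode=False):
        # String paddings select a padding algorithm instead of explicit sizes.
        if isinstance(padding, str):
            algo = padding.upper()
            if algo not in ("SAME", "VALID"):
                raise ValueError("padding string must be 'same' or 'valid'")
            if algo == "VALID" and ceil_mode:
                raise ValueError("ceil_mode must be False when padding is 'valid'")
            return [0] * num_dims, algo
        # Everything else is treated as explicit padding sizes.
        if isinstance(padding, int):
            return [padding] * num_dims, "EXPLICIT"
        return list(padding), "EXPLICIT"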
Args: x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with - shape [N, C, D, H, W]. The format of - input tensor is `"NCDHW"` or `"NDHWC"`, where `N` is batch size, `C` is - the number of channels, `D` is the depth of the feature, - `H` is the height of the feature, and `W` is the width - of the feature. - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size + shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` or `"NDHWC"`, where N represents batch size, C represents the number of channels, D, H and W represent the depth, height and width of the feature respectively. + kernel_size (int|list|tuple): The pool kernel size. If the kernel size is a tuple or list, it must contain three integers, - (pool_size_Depth, pool_size_Height, pool_size_Width). + (kernel_size_Depth, kernel_size_Height, kernel_size_Width). Otherwise, the pool kernel size will be the cube of an int. - stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool stride size is a tuple or list, - it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain three integers, [stride_Depth, stride_Height, stride_Width]. Otherwise, the pool stride size will be a cube of an int. - padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list, - it could be in three forms: `[pad_depth, pad_height, pad_width]` or - `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form - `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on both sides. + 3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_width] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. ceil_mode (bool): ${ceil_mode_comment} return_indices (bool): Whether to return the max indices along with the outputs. data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. @@ -973,7 +762,6 @@ def max_pool3d(x, name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor.
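To make the accepted padding forms listed above concrete, a short usage sketch in dynamic-graph mode (illustrative shapes; it assumes the `max_pool3d` signature shown in this patch):

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 8, 32, 32]).astype(np.float32))

    # Form 1: algorithmic padding chosen by a string.
    out_same = F.max_pool3d(x, kernel_size=2, stride=2, padding="same")
    # Forms 2-4: the same explicit one-voxel padding written three ways.
    out_a = F.max_pool3d(x, kernel_size=2, stride=2, padding=1)
    out_b = F.max_pool3d(x, kernel_size=2, stride=2, padding=[1, 1, 1])
    out_c = F.max_pool3d(x, kernel_size=2, stride=2, padding=[1, 1, 1, 1, 1, 1])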
Raises: @@ -986,23 +774,20 @@ def max_pool3d(x, import paddle.nn.functional as F import numpy as np paddle.disable_static() - # max pool3d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) - output = F.max_pool2d(input, + x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) + output = F.max_pool2d(x, kernel_size=2, stride=2, padding=0) output.shape [1, 3, 16, 16, 16] - # for return_indices=True - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) - output, max_indices = paddle.nn.functional.max_pool3d(input, + x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) + output, max_indices = paddle.nn.functional.max_pool3d(x, kernel_size = 2, stride = 2, padding=0, return_indices=True) # output.shape [None, 3, 16, 16, 16], max_indices.shape [None, 3, 16, 16, 16], - """ check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d') kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') @@ -1011,29 +796,10 @@ def max_pool3d(x, else: stride = utils.convert_to_list(stride, 3, 'pool_stride') - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." - % str(padding)) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0, 0, 0] - if ceil_mode != False: - raise ValueError( - "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. " - "Received ceil_mode: True.") - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0, 0, 0] + channel_last = _channel_last(data_format, 3) - if data_format not in ["NCDHW", "NDHWC"]: - raise ValueError( - "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " - "Attr(data_format): %s" % str(data_format)) - padding = update_padding3d(padding, data_format) + padding, padding_algorithm = _update_padding_nd( + padding, 3, channel_last=channel_last, ceil_mode=ceil_mode) if in_dygraph_mode(): output = core.ops.max_pool3d_with_index( @@ -1071,170 +837,83 @@ def max_pool3d(x, return (pool_out, mask) if return_indices else pool_out -def avg_pool3d(x, - kernel_size, - stride=None, - padding=0, - ceil_mode=False, - count_include_pad=False, - divisor_override=None, - data_format="NCDHW", - name=None): +def adaptive_avg_pool1d(x, output_size, name=None): """ - This operation applies 3D max pooling over input features based on the input, - and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCDHW format, where N is batch size, C is the number of channels, - H is the height of the feature, D is the depth of the feature, and W is the width of the feature. - + This API implements adaptive average pooling 1d operation. + See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` . + Args: - input (Tensor): The input tensor of pooling operator, which is a 5-D tensor with - shape [N, C, D, H, W], where `N` is batch size, `C` is - the number of channels, `D` is the depth of the feature, - `H` is the height of the feature, and `W` is the width - of the feature. - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size - is a tuple or list, it must contain three integers, - (pool_size_Depth, pool_size_Height, pool_size_Width). - Otherwise, the pool kernel size will be the cube of an int. - stride (string|int|list|tuple)): The pool padding. 
If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool stride size is a tuple or list, - it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`. - Otherwise, the pool stride size will be a cube of an int. - padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list, - it could be in three forms: `[pad_depth, pad_height, pad_width]` or - `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form - `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - ceil_mode (bool): ${ceil_mode_comment} - count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is True. - divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. - data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. - The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_depth, input_height, input_width]`. + x (Tensor): The input tensor of pooling operator, which is a 3-D tensor + with shape [N, C, L]. The format of input tensor is NCL, + where N is batch size, C is the number of channels, L is the + length of the feature. The data type is float32 or float64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one int. name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: - Tensor: The output tensor of pooling result. The data type is same as input tensor. + Tensor: The output tensor of adaptive average pooling result. The data type is same + as input tensor. Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. + ValueError: 'output_size' should be an integer or list or tuple with length as 1. Examples: .. code-block:: python - import paddle.fluid as fluid - import paddle - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) - # avg pool3d - pool3d = paddle.nn.functional.avg_pool3d( - input, - kernel_size = 2, - stride = 2, - padding=0) - # pool3d.shape: [1, 3, 16, 16, 16] - """ - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d') - kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') - if stride is None: - stride = kernel_size - else: - stride = utils.convert_to_list(stride, 3, 'pool_stride') - - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." 
- % str(padding)) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0, 0, 0] - if ceil_mode != False: - raise ValueError( - "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. " - "Received ceil_mode: True.") - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0, 0, 0] + # average adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. + # adaptive avg pool performs calculations as follows: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = sum(input[:, :, lstart: lend])/(lend - lstart) + # + import paddle + import paddle.nn.functional as F + import numpy as np + paddle.disable_static() + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + pool_out = F.adaptive_avg_pool1d(data, output_size=16) + # pool_out shape: [1, 3, 16] + """ + pool_type = 'avg' + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'adaptive_pool2d') + _check_input(x, 3) + check_type(output_size, 'pool_size', (int), 'adaptive_pool1d') - if data_format not in ["NCDHW", "NDHWC"]: - raise ValueError( - "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " - "Attr(data_format): %s" % str(data_format)) - padding = update_padding3d(padding, data_format) + pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') + l_type = "pool2d" + x = unsqueeze(x, [2]) if in_dygraph_mode(): - output = core.ops.pool3d( - x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride, - 'paddings', padding, 'global_pooling', False, 'padding_algorithm', - padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', not count_include_pad, - 'data_format', data_format) - if divisor_override is None: - return output - else: - check_instance(divisor_override, "divisor_override") - return output * (kernel_size[0] * kernel_size[1] * - kernel_size[2]) / divisor_override + pool_out = core.ops.pool2d(x, 'pooling_type', pool_type, 'ksize', + pool_size, 'adaptive', True) + return squeeze(pool_out, [2]) - op_type = "pool3d" - helper = LayerHelper(op_type, **locals()) + helper = LayerHelper(l_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) - outputs = {"Out": pool_out} + outputs = {"Out": pool_out} helper.append_op( - type=op_type, + type=l_type, inputs={"X": x}, outputs=outputs, attrs={ - "pooling_type": 'avg', - "ksize": kernel_size, - "global_pooling": False, - "strides": stride, - "paddings": padding, - "padding_algorithm": padding_algorithm, - "use_cudnn": True, - "ceil_mode": ceil_mode, - "use_mkldnn": False, - "exclusive": not count_include_pad, - "data_format": data_format, + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, }) - if divisor_override is None: - return pool_out - else: - check_instance(divisor_override, "divisor_override") - return pool_out * (kernel_size[0] * kernel_size[1] * - kernel_size[2]) / divisor_override + return squeeze(pool_out, [2]) def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): """ - - This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions - of the output tensor are determined by the parameter output_size. - See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool2d` . - - For avg adaptive pool2d: - - ..
math:: - - hstart &= floor(i * H_{in} / H_{out}) - - hend &= ceil((i + 1) * H_{in} / H_{out}) - - wstart &= floor(j * W_{in} / W_{out}) - - wend &= ceil((j + 1) * W_{in} / W_{out}) - - Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} + This API implements adaptive average pooling 2d operation. + See more details in :ref:`api_nn_pooling_AdaptiveAvgPool2d` . Args: x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. @@ -1248,16 +927,12 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor. - Raises: ValueError: If `data_format` is not "NCHW" or "NHWC". - Examples: .. code-block:: python - # adaptive avg pool2d # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], # output shape is [N, C, m, n], adaptive pool divide H and W dimensions @@ -1279,10 +954,10 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): input_data = np.random.rand(2, 3, 32, 32) x = paddle.to_tensor(input_data) # x.shape is [2, 3, 32, 32] - pool_out = paddle.nn.functional.adaptive_avg_pool2d( + out = paddle.nn.functional.adaptive_avg_pool2d( x = x, output_size=[3, 3]) - # pool_out.shape is [2, 3, 3, 3] + # out.shape is [2, 3, 3, 3] """ if not in_dygraph_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], @@ -1337,28 +1012,8 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): """ - - This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions - of the output tensor are determined by the parameter output_size. - See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool3d` . - - For avg adaptive pool3d: - - .. math:: - - dstart &= floor(i * D_{in} / D_{out}) - - dend &= ceil((i + 1) * D_{in} / D_{out}) - - hstart &= floor(j * H_{in} / H_{out}) - - hend &= ceil((j + 1) * H_{in} / H_{out}) - - wstart &= floor(k * W_{in} / W_{out}) - - wend &= ceil((k + 1) * W_{in} / W_{out}) - - Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + This API implements adaptive average pooling 3d operation. + See more details in :ref:`api_nn_pooling_AdaptiveAvgPool3d` . Args: x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. @@ -1372,16 +1027,12 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor. - Raises: ValueError: If `data_format` is not "NCDHW" or "NDHWC". - Examples: .. 
code-block:: python - # adaptive avg pool3d # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions @@ -1406,10 +1057,10 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): input_data = np.random.rand(2, 3, 8, 32, 32) x = paddle.to_tensor(input_data) # x.shape is [2, 3, 8, 32, 32] - pool_out = paddle.nn.functional.adaptive_avg_pool3d( + out = paddle.nn.functional.adaptive_avg_pool3d( x = x, output_size=[3, 3, 3]) - # pool_out.shape is [2, 3, 3, 3, 3] + # out.shape is [2, 3, 3, 3, 3] """ if not in_dygraph_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], @@ -1461,3 +1112,257 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): }) return pool_out + + +def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): + """ + This API implements adaptive max pooling 1d operation. + See more details in :ref:`api_nn_pooling_AdaptiveMaxPool1d` . + + Args: + x (Tensor): The input tensor of pooling operator, which is a 3-D tensor + with shape [N, C, L]. The format of input tensor is NCL, + where N is batch size, C is the number of channels, L is the + length of the feature. The data type is float32 or float64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one int. + return_indices (bool): If true, the index of max pooling point will be returned along + with outputs. It cannot be set in average pooling type. Default False. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Returns: + Tensor: The output tensor of adaptive pooling result. The data type is same + as input tensor. + Raises: + ValueError: 'output_size' should be a integer or list or tuple with length as 1. + Examples: + .. code-block:: python + # max adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. 
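The grid arithmetic described above (and spelled out in the comments that follow) can be cross-checked with a few lines of NumPy; a minimal 1-D reference, illustrative only and not the kernel the operator dispatches to:

    import numpy as np

    def adaptive_max_pool1d_ref(x, m):
        # x has shape [N, C, L]; returns [N, C, m] using the floor/ceil window
        # boundaries lstart = floor(i * L / m), lend = ceil((i + 1) * L / m).
        n, c, length = x.shape
        out = np.empty((n, c, m), dtype=x.dtype)
        for i in range(m):
            lstart = (i * length) // m
            lend = -((-(i + 1) * length) // m)
            out[:, :, i] = x[:, :, lstart:lend].max(axis=-1)
        return out

    x = np.random.uniform(-1, 1, (1, 3, 32)).astype("float32")
    print(adaptive_max_pool1d_ref(x, 16).shape)  # (1, 3, 16)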
+ # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = max(input[:, :, lstart: lend]) + # + import paddle + import paddle.nn.functional as F + paddle.disable_static() + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + pool_out = F.adaptive_max_pool1d(data, output_size=16) + # pool_out shape: [1, 3, 16]) + pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_indices=True) + # pool_out shape: [1, 3, 16] indices shape: [1, 3, 16] + """ + pool_type = 'max' + check_variable_and_dtype(x, 'x', ['float32', 'float64'], + 'adaptive_max_pool1d') + _check_input(x, 3) + check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d') + check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d') + + pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') + + l_type = 'max_pool2d_with_index' + + x = unsqueeze(x, [2]) + if in_dygraph_mode(): + pool_out = core.ops.max_pool2d_with_index( + x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True) + return (squeeze(pool_out[0], [2]), squeeze( + pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2]) + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + mask = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out, "Mask": mask} + + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, + }) + + return (squeeze(pool_out, [2]), + squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2]) + + +def adaptive_max_pool2d(x, output_size, return_indices=False, name=None): + """ + This operation applies a 2D adaptive max pooling on input tensor. + See more details in :ref:`api_nn_pooling_AdaptiveMaxPool2d` . + Args: + x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float16, float32, float64, int32 or int64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two elements, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. + return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Returns: + Tensor: The output tensor of adaptive max pool2d result. The data type is same as input tensor. + Examples: + .. code-block:: python + # max adaptive pool2d + # suppose input data in the shape of [N, C, H, W], `output_size` is [m, n] + # output shape is [N, C, m, n], adaptive pool divide H and W dimensions + # of input data into m*n grids averagely and performs poolings in each + # grid to get output. 
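For the 2-D variant, `output_size` may mix ints with `None` (a `None` entry keeps that input dimension unchanged, as the shape fix-up code below implements), and `return_indices=True` also returns the argmax positions. A short dygraph usage sketch with assumed shapes:

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    x = paddle.to_tensor(np.random.rand(2, 3, 32, 32).astype("float32"))

    out = F.adaptive_max_pool2d(x, output_size=[3, None])  # shape [2, 3, 3, 32]
    out, indices = F.adaptive_max_pool2d(x, output_size=3, return_indices=True)
    # out.shape [2, 3, 3, 3], indices.shape [2, 3, 3, 3]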
+ # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # for j in range(n): + # hstart = floor(i * H / m) + # hend = ceil((i + 1) * H / m) + # wstart = floor(i * W / n) + # wend = ceil((i + 1) * W / n) + # output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend]) + # + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(2, 3, 32, 32) + x = paddle.to_tensor(input_data) + # x.shape is [2, 3, 32, 32] + out = paddle.nn.functional.adaptive_max_pool2d( + x = x, + output_size=[3, 3]) + # out.shape is [2, 3, 3, 3] + """ + if not in_dygraph_mode(): + check_variable_and_dtype(x, 'x', ['float32', 'float64'], + 'adaptive_max_pool2d') + _check_input(x, 4) + #check_type(output_size, 'pool_size', (int), 'adaptive_max_pool2d') + check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool2d') + + in_h, in_w = x.shape[2:4] + if isinstance(output_size, int): + output_size = utils.convert_to_list(output_size, 2, 'output_size') + else: + if output_size[0] == None: + output_size[0] = in_h + if output_size[1] == None: + output_size[1] = in_w + + if in_dygraph_mode(): + pool_out = core.ops.max_pool2d_with_index( + x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True) + return pool_out if return_indices else pool_out[0] + + l_type = 'max_pool2d_with_index' + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + mask = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out, "Mask": mask} + + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'max', + "ksize": output_size, + "adaptive": True, + }) + #return (pool_out, mask) if return_indices else pool_out + return pool_out + + +def adaptive_max_pool3d(x, output_size, return_indices=False, name=None): + """ + This operation applies a 3D adaptive max pooling on input tensor. + See more details in :ref:`api_nn_pooling_AdaptiveMaxPool3d` . + Args: + x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input. + return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Returns: + Tensor: The output tensor of adaptive max pool3d result. The data type is same as input tensor. + Examples: + .. code-block:: python + # adaptive max pool3d + # suppose input data in the shape of [N, C, D, H, W], `output_size` is [l, m, n] + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions + # of input data into m*n grids averagely and performs poolings in each + # grid to get output. 
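One consequence of the window arithmetic above worth noting: with `output_size=1` the single window spans the whole spatial extent, so adaptive max pooling degenerates to a global max. A quick dygraph check, with assumed shapes:

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    x = paddle.to_tensor(np.random.rand(2, 3, 8, 16, 16).astype("float32"))

    pooled = F.adaptive_max_pool3d(x, output_size=1)   # shape [2, 3, 1, 1, 1]
    global_max = x.numpy().max(axis=(2, 3, 4))
    print(np.allclose(pooled.numpy().reshape(2, 3), global_max))  # expected: True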
+ # adaptive max pool performs calculations as follow: + # + # for i in range(l): + # for j in range(m): + # for k in range(n): + # dstart = floor(i * D / l) + # dend = ceil((i + 1) * D / l) + # hstart = floor(i * H / m) + # hend = ceil((i + 1) * H / m) + # wstart = floor(i * W / n) + # wend = ceil((i + 1) * W / n) + # output[:, :, i, j, k] = max(input[:, :, dstart: dend, hstart: hend, wstart: wend]) + # + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(2, 3, 8, 32, 32) + x = paddle.to_tensor(input_data) + # x.shape is [2, 3, 8, 32, 32] + out = paddle.nn.functional.adaptive_max_pool3d( + x = x, + output_size=[3, 3, 3]) + # out.shape is [2, 3, 3, 3, 3] + """ + + if not in_dygraph_mode(): + check_variable_and_dtype(x, 'x', ['float32', 'float64'], + 'adaptive_max_pool3d') + _check_input(x, 5) + #check_type(output_size, 'pool_size', (int), 'adaptive_max_pool3d') + check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool3d') + + in_l, in_h, in_w = x.shape[2:5] + if isinstance(output_size, int): + output_size = utils.convert_to_list(output_size, 3, 'output_size') + else: + if output_size[0] == None: + output_size[0] = in_l + if output_size[1] == None: + output_size[1] = in_h + if output_size[2] == None: + output_size[2] = in_w + + if in_dygraph_mode(): + pool_out = core.ops.max_pool3d_with_index( + x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True) + return pool_out if return_indices else pool_out[0] + + l_type = 'max_pool3d_with_index' + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + mask = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out, "Mask": mask} + + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'max', + "ksize": output_size, + "adaptive": True, + }) + + return (pool_out, mask) if return_indices else pool_out diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 3399e4e34c9e3bc61fde515fc1917deb213f3d0b..6eac15cd694e51c24f94f7686b6e63fa7c6cbf09 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -66,16 +66,18 @@ from .common import Dropout #DEFINE_ALIAS from .common import Dropout2D #DEFINE_ALIAS from .common import Dropout3D #DEFINE_ALIAS from .common import AlphaDropout #DEFINE_ALIAS -from .pooling import AdaptiveAvgPool2d #DEFINE_ALIAS -from .pooling import AdaptiveAvgPool3d #DEFINE_ALIAS from .pooling import AvgPool1d #DEFINE_ALIAS -from .pooling import MaxPool1d #DEFINE_ALIAS -from .pooling import AdaptiveAvgPool1d #DEFINE_ALIAS -from .pooling import AdaptiveMaxPool1d #DEFINE_ALIAS from .pooling import AvgPool2d #DEFINE_ALIAS -from .pooling import MaxPool2d #DEFINE_ALIAS from .pooling import AvgPool3d #DEFINE_ALIAS +from .pooling import MaxPool1d #DEFINE_ALIAS +from .pooling import MaxPool2d #DEFINE_ALIAS from .pooling import MaxPool3d #DEFINE_ALIAS +from .pooling import AdaptiveAvgPool1d #DEFINE_ALIAS +from .pooling import AdaptiveAvgPool2d #DEFINE_ALIAS +from .pooling import AdaptiveAvgPool3d #DEFINE_ALIAS +from .pooling import AdaptiveMaxPool1d #DEFINE_ALIAS +from .pooling import AdaptiveMaxPool2d #DEFINE_ALIAS +from .pooling import AdaptiveMaxPool3d #DEFINE_ALIAS from .conv import Conv1d #DEFINE_ALIAS from .conv import Conv2d #DEFINE_ALIAS from .conv import Conv3d #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 
7d0e59fb7575c9d15d28e88a462aed4ddba47fb9..4e342c00528a2c0115940bb7f695e1ed5b582382 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -99,7 +99,8 @@ class _ConvNd(layers.Layer): raise ValueError("in_channels must be divisible by groups.") if padding_mode in {'reflect', 'replicate', 'circular'}: - _paired_padding = utils.convert_to_list(padding, 2, 'padding') + _paired_padding = utils.convert_to_list(padding, dims, + 'padding') self._reversed_padding_repeated_twice = _reverse_repeat_list( _paired_padding, 2) @@ -318,62 +319,80 @@ class Conv2d(_ConvNd): output of the convolution, and the corresponding activation function is applied to the final result. For each input :math:`X`, the equation is: - .. math:: - Out = \\sigma (W \\ast X + b) + + .. math:: + + Out = \sigma (W \\ast X + b) + Where: + * :math:`X`: Input value, a ``Tensor`` with NCHW format. * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + Parameters: - in_channels(int): The number of channels in the input image. - out_channels(int): The number of channels produced by convolution. - kernel_size (int|list|tuple): The size of convolution kernel. - stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. Default: 1. + in_channels(int): The number of input channels in the input image. + out_channels(int): The number of output channels produced by the convolution. + kernel_size(int|list|tuple, optional): The size of the convolving kernel. + stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must + contain three integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. The default value is 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding`on both sides + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). The default value is 0. - padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` . - dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must - contain two integers, (dilation_H, dilation_W). Otherwise, the - dilation_H = dilation_W = dilation. Default: 1. - groups (int, optional): The groups number of the Conv2d Layer. 
According to grouped + dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the + dilation_D = dilation_H = dilation_W = dilation. The default value is 1. + groups(int, optional): The groups number of the Conv3d Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only - connected to the second half of the input channels. Default: 1. - weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) + connected to the second half of the input channels. The default value is 1. + padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``. + weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights of conv2d. If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with :math:`Normal(0.0, std)`, - and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. - bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d. + will create ParamAttr as param_attr. If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv2d will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - data_format (str, optional): Data format that specifies the layout of input. + is not set, the bias is initialized zero. The default value is None. + data_format(str, optional): Data format that specifies the layout of input. It can be "NCHW" or "NHWC". Default: "NCHW". + Attribute: + **weight** (Parameter): the learnable weights of filter of this layer. + **bias** (Parameter or None): the learnable bias of this layer. + Shape: + - x: :math:`(N, C_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, H_{out}, W_{out})` + Where - .. math:: - H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel_size[0] - 1) + 1))}{strides[0]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel_size[1] - 1) + 1))}{strides[1]} + 1 + + .. math:: + + H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 + + W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 + Examples: + .. code-block:: python + import numpy as np import paddle import paddle.nn as nn @@ -646,35 +665,29 @@ class ConvTranspose2d(_ConvNd): The details of convolution transpose layer, please refer to the following explanation and references `conv2dtranspose `_ . For each input :math:`X`, the equation is: - .. math:: + + .. math:: + Out = \sigma (W \\ast X + b) + Where: + * :math:`X`: Input value, a ``Tensor`` with NCHW format. * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. 
* :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - Example: - - Input: - Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` - Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)` - - Output: - Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` - Where - .. math:: - H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\ - W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\ - H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ - W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) + Parameters: in_channels(int): The number of channels in the input image. out_channels(int): The number of channels produced by the convolution. kernel_size(int|list|uple): The kernel size. If kernel_size is a tuple, it must contain two integers, (kernel_size_H, kernel_size_W). Otherwise, the kernel will be a square. - output_padding(int|list|tuple, optional): Additional size added to one side - of each dimension in the output shape. Default: 0. + stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` on both sides @@ -682,9 +695,8 @@ class ConvTranspose2d(_ConvNd): 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). The default value is 0. - stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. Default: 1. + output_padding(int|list|tuple, optional): Additional size added to one side + of each dimension in the output shape. Default: 0. dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must contain two integers, (dilation_H, dilation_W). Otherwise, the dilation_H = dilation_W = dilation. Default: 1. @@ -694,29 +706,46 @@ class ConvTranspose2d(_ConvNd): first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: 1. - weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) + weight_attr(ParamAttr, optional): The parameter attribute for learnable weights(Parameter) of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d_transpose. + bias_attr(ParamAttr|bool, optional): The attribute for the bias of conv2d_transpose. If it is set to False, no bias will be added to the output units. 
If it is set to None or one attribute of ParamAttr, conv2d_transpose will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - data_format (str, optional): Data format that specifies the layout of input. + data_format(str, optional): Data format that specifies the layout of input. It can be "NCHW" or "NHWC". Default: "NCHW". + Attribute: + **weight** (Parameter): the learnable weights of filters of this layer. + **bias** (Parameter or None): the learnable bias of this layer. + Shape: + - x: :math:`(N, C_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, H_{out}, W_{out})` + Where - .. math:: - H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\ - W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\ + + .. math:: + + H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel\_size[0] - 1) + 1 + + W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel\_size[1] - 1) + 1 + + H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) + + W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) + Examples: + .. code-block:: python + import numpy as np import paddle import paddle.nn as nn @@ -791,66 +820,86 @@ class Conv3d(_ConvNd): provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. For each input :math:`X`, the equation is: - .. math:: + + .. math:: + Out = \sigma (W \\ast X + b) + In the above equation: + * :math:`X`: Input value, a tensor with NCDHW or NDHWC format. * :math:`W`: Filter value, a tensor with MCDHW format. * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + Parameters: in_channels(int): The number of input channels in the input image. out_channels(int): The number of output channels produced by the convolution. - kernel_size (int|list|tuple, optional): The size of the convolving kernel. - stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must + kernel_size(int|list|tuple, optional): The size of the convolving kernel. + stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must contain three integers, (stride_D, stride_H, stride_W). Otherwise, the stride_D = stride_H = stride_W = stride. The default value is 1. - padding (int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. + padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. 
Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). The default value is 0. - dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must + dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the dilation_D = dilation_H = dilation_W = dilation. The default value is 1. - groups (int, optional): The groups number of the Conv3d Layer. According to grouped + groups(int, optional): The groups number of the Conv3d Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. The default value is 1. - padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``. - weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights + padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``. + weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights of conv3d. If it is set to None or one attribute of ParamAttr, conv3d will create ParamAttr as param_attr. If it is set to None, the parameter is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. - bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv3d. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv3d will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. The default value is None. - data_format (str, optional): Data format that specifies the layout of input. + data_format(str, optional): Data format that specifies the layout of input. It can be "NCDHW" or "NDHWC". Default: "NCDHW". + Attribute: + **weight** (Parameter): the learnable weights of filters of this layer. + **bias** (Parameter): the learnable bias of this layer. + Shape: + - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + Where - .. math:: - D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ - H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 + + .. math:: + + D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 + + H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 + + W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (kernel\_size[2] - 1) + 1))}{strides[2]} + 1 + Raises: ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. + Examples: + .. 
code-block:: python + import numpy as np import paddle @@ -936,17 +985,22 @@ class ConvTranspose3d(_ConvNd): the output of the convolution, and the corresponding activation function is applied to the final result. For each input :math:`X`, the equation is: - .. math:: + + .. math:: + Out = \sigma (W \\ast X + b) + In the above equation: + * :math:`X`: Input value, a tensor with NCDHW format. * :math:`W`: Filter value, a tensor with MCDHW format. * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - Example: + **Note**: + The conv_transpose3d can be seen as the backward of the conv3d. For conv3d, when stride > 1, conv3d maps multiple input shape to the same output shape, so for conv_transpose3d, when stride > 1, input shape maps multiple output shape. @@ -957,6 +1011,7 @@ class ConvTranspose3d(_ConvNd): and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, conv_transpose3d can compute the kernel size automatically. + Parameters: in_channels(int): The number of channels in the input image. out_channels(int): The number of channels produced by the convolution. @@ -985,11 +1040,11 @@ class ConvTranspose3d(_ConvNd): first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. The default value is 1. - weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights + weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. The default value is None. - bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv3d_transpose will create ParamAttr as bias_attr. If the Initializer of the bias_attr @@ -999,24 +1054,38 @@ class ConvTranspose3d(_ConvNd): filter_size, padding, and stride to calculate output_size. if output_size and filter_size are specified at the same time, They should follow the formula above. Default: None. - data_format (str, optional): Data format that specifies the layout of input. + data_format(str, optional): Data format that specifies the layout of input. It can be "NCDHW" or "NDHWC". Default: "NCDHW". + Attribute: + **weight** (Parameter): the learnable weights of filters of this layer. + **bias** (Parameter): the learnable bias of this layer. + Shape: + - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + Where - .. math:: - D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\ - H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\ - W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel_size[2] - 1) + 1 \\\\ + + .. 
math:: + + D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel\_size[0] - 1) + 1 + + H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel\_size[1] - 1) + 1 + + W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel\_size[2] - 1) + 1 + Raises: ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. Examples: + .. code-block:: python + import numpy as np import paddle import paddle.nn as nn @@ -1024,7 +1093,7 @@ class ConvTranspose3d(_ConvNd): paddle.disable_static() x_var = paddle.to_tensor(x) - conv = nn.Conv3DTranspose(4, 6, (3, 3, 3)) + conv = nn.ConvTranspose3d(4, 6, (3, 3, 3)) y_var = conv(x_var) y_np = y_var.numpy() print(y_np.shape) diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index de10e77eb1c000e66a7a914dc94ce39a6268bb61..a1c7d28a85e762ebb381c5f0075df1c7b00396f7 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -634,9 +634,12 @@ class KLDivLoss(fluid.dygraph.Layer): Default is ``'mean'``. Shape: - - input: (N, *) where * means, any number of additional dimensions. - - label: (N, *), same shape as input - - output: tensor with shape: (1) by default. + + - input (Tensor): (N, *), where * means, any number of additional dimensions. + + - label (Tensor): (N, *), same shape as input. + + - output (Tensor): tensor with shape: [1] by default. Examples: @@ -646,7 +649,7 @@ class KLDivLoss(fluid.dygraph.Layer): import numpy as np import paddle.nn as nn - paddle.enable_imperative() + paddle.disable_static() shape = (5, 20) x = np.random.uniform(-10, 10, shape).astype('float32') @@ -654,26 +657,26 @@ class KLDivLoss(fluid.dygraph.Layer): # 'batchmean' reduction, loss shape will be [N] kldiv_criterion = nn.KLDivLoss(reduction='batchmean') - pred_loss = kldiv_criterion(paddle.to_variable(x), - paddle.to_variable(target)) + pred_loss = kldiv_criterion(paddle.to_tensor(x), + paddle.to_tensor(target)) # shape=[5] # 'mean' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='mean') - pred_loss = kldiv_criterion(paddle.to_variable(x), - paddle.to_variable(target)) + pred_loss = kldiv_criterion(paddle.to_tensor(x), + paddle.to_tensor(target)) # shape=[1] # 'sum' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='sum') - pred_loss = kldiv_criterion(paddle.to_variable(x), - paddle.to_variable(target)) + pred_loss = kldiv_criterion(paddle.to_tensor(x), + paddle.to_tensor(target)) # shape=[1] # 'none' reduction, loss shape is same with X shape kldiv_criterion = nn.KLDivLoss(reduction='none') - pred_loss = kldiv_criterion(paddle.to_variable(x), - paddle.to_variable(target)) + pred_loss = kldiv_criterion(paddle.to_tensor(x), + paddle.to_tensor(target)) # shape=[5, 20] """ diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index c7855b23bf6e6861326533e3cc93d7f7c5bd4ca2..4d25418579d74ae896f8ca590400a0a334047e93 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -27,6 +27,7 @@ # TODO: define normalization api +import six from ...fluid.dygraph.nn import InstanceNorm from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS @@ -36,7 +37,6 @@ from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS from ...fluid.dygraph import layers - from ...framework import get_default_dtype, set_default_dtype from ...fluid.framework import in_dygraph_mode @@ -50,6 +50,7 @@ from 
..functional import batch_norm, layer_norm, instance_norm import numpy as np import numbers import warnings +from ...fluid.dygraph.base import no_grad __all__ = [ 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', @@ -566,17 +567,28 @@ class _BatchNormBase(layers.Layer): param_shape = [num_features] # create parameter - self.weight = self.create_parameter( - attr=self._weight_attr, - shape=param_shape, - default_initializer=Constant(1.0)) - self.weight.stop_gradient = (self._weight_attr is False) or ( - self._weight_attr and self._weight_attr.learning_rate == 0.) + if weight_attr == False: + self.weight = self.create_parameter( + attr=None, shape=param_shape, default_initializer=Constant(1.0)) + self.weight.stop_gradient = True + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. - self.bias = self.create_parameter( - attr=self._bias_attr, shape=param_shape, is_bias=True) - self.bias.stop_gradient = (self._bias_attr is False) or ( - self._bias_attr and self._bias_attr.learning_rate == 0.) + if bias_attr == False: + self.bias = self.create_parameter( + attr=None, + shape=param_shape, + default_initializer=Constant(0.0), + is_bias=True) + self.bias.stop_gradient = True + else: + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0. moving_mean_name = None moving_variance_name = None @@ -611,6 +623,7 @@ class _BatchNormBase(layers.Layer): self._epsilon = epsilon self._fuse_with_relu = False self._track_running_stats = track_running_stats + self._name = name def _check_input_dim(self, input): raise NotImplementedError("BatchNorm Base error") @@ -898,7 +911,7 @@ class BatchNorm3d(_BatchNormBase): len(input.shape))) -class SyncBatchNorm(layers.Layer): +class SyncBatchNorm(_BatchNormBase): """ This interface is used to construct a callable object of the ``SyncBatchNorm`` class. It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can @@ -984,72 +997,16 @@ class SyncBatchNorm(layers.Layer): def __init__(self, num_features, - epsilon=1e-05, momentum=0.9, - track_running_stats=True, + epsilon=1e-05, weight_attr=None, bias_attr=None, data_format='NCHW', + track_running_stats=True, name=None): - super(SyncBatchNorm, self).__init__() - self._weight_attr = weight_attr - self._bias_attr = bias_attr - self._num_features = num_features - self._data_layout = data_format - self._momentum = momentum - self._epsilon = epsilon - self._track_running_stats = track_running_stats - - if self._track_running_stats == False: - warnings.warn( - "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version." - ) - - param_shape = [self._num_features] - - # create parameter - if weight_attr == False: - self.weight = self.create_parameter( - attr=None, shape=param_shape, default_initializer=Constant(1.0)) - self.weight.stop_gradient = True - else: - self.weight = self.create_parameter( - attr=self._weight_attr, - shape=param_shape, - default_initializer=Constant(1.0)) - self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. 
- - if bias_attr == False: - self.bias = self.create_parameter( - attr=None, - shape=param_shape, - default_initializer=Constant(0.0), - is_bias=True) - self.bias.stop_gradient = True - else: - self.bias = self.create_parameter( - attr=self._bias_attr, shape=param_shape, is_bias=True) - self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. - - self._mean = self.create_parameter( - attr=ParamAttr( - name=None, - initializer=Constant(0.0), - trainable=False, - do_model_average=True), - shape=param_shape, - dtype=self._dtype) - self._mean.stop_gradient = True - - self._variance = self.create_parameter( - attr=ParamAttr( - name=None, - initializer=Constant(1.0), - trainable=False, - do_model_average=True), - shape=param_shape, - dtype=self._dtype) - self._variance.stop_gradient = True + super(SyncBatchNorm, + self).__init__(num_features, momentum, epsilon, weight_attr, + bias_attr, data_format, track_running_stats, name) def forward(self, x): # create output @@ -1063,7 +1020,7 @@ class SyncBatchNorm(layers.Layer): if in_dygraph_mode(): attrs = ("momentum", self._momentum, "epsilon", self._epsilon, "is_test", not self.training, "data_layout", - self._data_layout, "use_mkldnn", False, "fuse_with_relu", + self._data_format, "use_mkldnn", False, "fuse_with_relu", False, "use_global_stats", False, 'trainable_statistics', False) sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm( @@ -1073,13 +1030,13 @@ class SyncBatchNorm(layers.Layer): return sync_batch_norm_out check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], - 'BatchNorm') + 'SyncBatchNorm') attrs = { "momentum": self._momentum, "epsilon": self._epsilon, "is_test": not self.training, - "data_layout": self._data_layout, + "data_layout": self._data_format, "use_mkldnn": False, "fuse_with_relu": False, "use_global_stats": False, @@ -1112,3 +1069,45 @@ class SyncBatchNorm(layers.Layer): self._helper.append_op( type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) return sync_batch_norm_out + + @classmethod + def convert_sync_batchnorm(cls, layer): + """ + Helper function to convert :class: `paddle.nn.BatchNorm*d` layers in the model to :class: `paddle.nn.SyncBatchNorm` layers. + + Parameters: + layer(paddle.nn.Layer): model containing one or more `BatchNorm*d` layers. + + Returns: + The original model with converted SyncBatchNorm layers. If BatchNorm*d layer in the model, use SyncBatchNorm layer instead. + + Examples: + + .. 
code-block:: python + import paddle + import paddle.nn as nn + + paddle.disable_static() + model = nn.Sequential(nn.Conv2d(3, 5, 3), nn.BatchNorm2d(5)) + sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + + """ + layer_output = layer + if isinstance(layer, _BatchNormBase): + layer_output = SyncBatchNorm(layer._num_features, layer._epsilon, + layer._momentum, layer._weight_attr, + layer._bias_attr, layer._data_format, + layer._name) + + if layer._weight_attr != False and layer._bias_attr != False: + with no_grad(): + layer_output.weight = layer.weight + layer_output.bias = layer.bias + layer_output._mean = layer._mean + layer_output._variance = layer._variance + + for name, sublayer in layer.named_sublayers(): + layer_output.add_sublayer(name, + cls.convert_sync_batchnorm(sublayer)) + del layer + return layer_output diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 87fa0caec9ee287c42d8308d9da25c6d2fc9b911..6f6b567849732ff889db4507708758cd8eeab2a8 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -12,198 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle - -from ...fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype -from ...fluid.layers import utils from ...fluid.dygraph import layers from ...fluid.layer_helper import LayerHelper from .. import functional as F __all__ = [ - 'AdaptiveAvgPool2d', - 'AdaptiveAvgPool3d', 'AvgPool1d', - 'maxPool1d', - 'AdaptiveMaxPool1d', - 'AdaptiveAvgPool1d', 'AvgPool2d', - 'MaxPool2d', 'AvgPool3d', + 'MaxPool1d', + 'MaxPool2d', 'MaxPool3d', + 'AdaptiveAvgPool1d', + 'AdaptiveAvgPool2d', + 'AdaptiveAvgPool3d', + 'AdaptiveMaxPool1d', + 'AdaptiveMaxPool2d', + 'AdaptiveMaxPool3d', ] -class AdaptiveAvgPool2d(layers.Layer): - """ - - This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions - of the output tensor are determined by the parameter output_size. - - For avg adaptive pool2d: - - .. math:: - - hstart &= floor(i * H_{in} / H_{out}) - - hend &= ceil((i + 1) * H_{in} / H_{out}) - - wstart &= floor(j * W_{in} / W_{out}) - - wend &= ceil((j + 1) * W_{in} / W_{out}) - - Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} - - - Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain two element, (H, W). H and W can be either a int, or None which means - the size will be the same as that of the input. - data_format (str): The data format of the input and output data. An optional string - from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in - the order of: [batch_size, input_channels, input_height, input_width]. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Shape: - x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type can be float32 or float64. - output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type is same as input x. - - Returns: - A callable object of AdaptiveAvgPool2d. - - Examples: - .. 
code-block:: python - - # adaptive avg pool2d - # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], - # output shape is [N, C, m, n], adaptive pool divide H and W dimensions - # of input data into m * n grids averagely and performs poolings in each - # grid to get output. - # adaptive avg pool performs calculations as follow: - # - # for i in range(m): - # for j in range(n): - # hstart = floor(i * H / m) - # hend = ceil((i + 1) * H / m) - # wstart = floor(i * W / n) - # wend = ceil((i + 1) * W / n) - # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) - # - import paddle - import numpy as np - paddle.disable_static() - input_data = np.random.rand(2, 3, 32, 32) - x = paddle.to_tensor(input_data) - # x.shape is [2, 3, 32, 32] - adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=3) - pool_out = adaptive_avg_pool(x = x) - # pool_out.shape is [2, 3, 3, 3] - """ - - def __init__(self, output_size, data_format="NCHW", name=None): - super(AdaptiveAvgPool2d, self).__init__() - self._output_size = output_size - self._data_format = data_format - self._name = name - - def forward(self, x): - return F.adaptive_avg_pool2d( - x, - output_size=self._output_size, - data_format=self._data_format, - name=self._name) - - -class AdaptiveAvgPool3d(layers.Layer): - """ - - This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions - of the output tensor are determined by the parameter output_size. - - For avg adaptive pool3d: - - .. math:: - - dstart &= floor(i * D_{in} / D_{out}) - - dend &= ceil((i + 1) * D_{in} / D_{out}) - - hstart &= floor(j * H_{in} / H_{out}) - - hend &= ceil((j + 1) * H_{in} / H_{out}) - - wstart &= floor(k * W_{in} / W_{out}) - - wend &= ceil((k + 1) * W_{in} / W_{out}) - - Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} - - - Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means - the size will be the same as that of the input. - data_format (str): The data format of the input and output data. An optional string - from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in - the order of: [batch_size, input_channels, input_depth, input_height, input_width]. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - Shape: - x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type can be float32 or float64. - output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x. - - Returns: - A callable object of AdaptiveAvgPool3d. - - Examples: - .. code-block:: python - - # adaptive avg pool3d - # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], - # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions - # of input data into l * m * n grids averagely and performs poolings in each - # grid to get output. 
- # adaptive avg pool performs calculations as follow: - # - # for i in range(l): - # for j in range(m): - # for k in range(n): - # dstart = floor(i * D / l) - # dend = ceil((i + 1) * D / l) - # hstart = floor(j * H / m) - # hend = ceil((j + 1) * H / m) - # wstart = floor(k * W / n) - # wend = ceil((k + 1) * W / n) - # output[:, :, i, j, k] = - # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) - import paddle - import numpy as np - paddle.disable_static() - input_data = np.random.rand(2, 3, 8, 32, 32) - x = paddle.to_tensor(input_data) - # x.shape is [2, 3, 8, 32, 32] - adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=3) - pool_out = adaptive_avg_pool(x = x) - # pool_out = [2, 3, 3, 3, 3] - """ - - def __init__(self, output_size, data_format="NCDHW", name=None): - super(AdaptiveAvgPool3d, self).__init__() - self._output_size = output_size - self._data_format = data_format - self._name = name - - def forward(self, x): - return F.adaptive_avg_pool3d( - x, - output_size=self._output_size, - data_format=self._data_format, - name=self._name) - - class AvgPool1d(layers.Layer): """ This operation applies a 1D average pooling over an input signal composed @@ -223,17 +51,20 @@ class AvgPool1d(layers.Layer): Args: kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one integers. + it must contain an integer. stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain one integers. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero, - then the input is implicitly zero-padded on both sides for padding number of points. + it must contain an integer. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides. + 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is `true`. + mode, default is `True`. ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width. - If it is set to False, the floor function will be used. Default False + If it is set to False, the floor function will be used. The default value is False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -245,10 +76,14 @@ class AvgPool1d(layers.Layer): ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. ValueError: If `padding` is a list or tuple but its length greater than 1. - ShapeError: If the input is not a 3-D. + ShapeError: If the input is not a 3-D tensor. ShapeError: If the output's shape calculated is not greater than 0. 
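As a quick illustration of the arguments documented above, a minimal `AvgPool1d` sketch in the same style as the other pooling examples in this file (with kernel_size=2, stride=2, padding=0 the length dimension is halved):

    import numpy as np
    import paddle
    import paddle.nn as nn

    paddle.disable_static()
    data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))

    # Average pooling over non-overlapping windows of size 2.
    avg_pool = nn.AvgPool1d(kernel_size=2, stride=2, padding=0)
    out = avg_pool(data)
    # out shape: [1, 3, 16]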
+ Shape: + - inpuut: 3-D tensor. + - output: 3-D tensor + Examples: .. code-block:: python @@ -284,63 +119,74 @@ class AvgPool1d(layers.Layer): return out -class MaxPool1d(layers.Layer): +class AvgPool2d(layers.Layer): """ - Applies a 1D max pooling over an input signal composed of several input planes based - on the input, output_size, return_indices parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - - The output value of the layer with input size (N, C, L), - output (N, C, L_{out}) and kernel_size k can be precisely described as - For average pool1d: + This operation applies 2D average pooling over input features based on the input, + and kernel_size, stride, padding parameters. Input(X) and Output(Out) are + in NCHW format, where N is batch size, C is the number of channels, + H is the height of the feature, and W is the width of the feature. - .. math:: + Example: + Input: + X shape: $(N, C, H_{in}, W_{in})$ + Attr: + kernel_size: ksize - Output(N_i, C_i, l) &= max(Input[N_i, C_i, stride \times l:stride \times l+k])} + Output: + Out shape: $(N, C, H_{out}, W_{out})$ + $$ + out(N_i, C_j, h, w) = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1} + input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) + $$ Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one integers. + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two integers, (pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be a square of an int. stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain one integers. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be the following forms: `[pad_left, pad_right]`. - return_indices (bool): Whether return the max indices along with the outputs. default is `False`. - ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default. - If it is set to False, the floor function will be used. Default False + it must contain two integers, (pool_stride_Height, pool_stride_Width). + Otherwise, the pool stride size will be a square of an int. + + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is `true`. 
+ divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. + The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: - None. + Shape: + - x: 4-D tensor. + - out: 2-D tensor + Returns: None. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ValueError: If `padding` is a list or tuple but its length greater than 1. - ShapeError: If the input is not a 3-D. ShapeError: If the output's shape calculated is not greater than 0. - - Examples: - .. code-block:: python - import paddle import paddle.nn as nn + import numpy as np paddle.disable_static() - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0) - pool_out = MaxPool1d(data) - # pool_out shape: [1, 3, 16] - - MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0, return_indices=True) - pool_out, indices = MaxPool1d(data) - # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + # max pool2d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + AvgPool2d = nn.AvgPool2d(kernel_size=2, + stride=2, padding=0) + output = AvgPoo2d(input) + # output.shape [1, 3, 16, 16] """ @@ -348,113 +194,155 @@ class MaxPool1d(layers.Layer): kernel_size, stride=None, padding=0, - return_indices=False, ceil_mode=False, + count_include_pad=True, + divisor_override=None, + data_format="NCHW", name=None): - super(MaxPool1d, self).__init__() - self.kernel_size = kernel_size + super(AvgPool2d, self).__init__() + self.ksize = kernel_size self.stride = stride self.padding = padding self.ceil_mode = ceil_mode - self.return_indices = return_indices + self.count_include_pad = count_include_pad + self.divisor = divisor_override + self.data_format = data_format self.name = name - def forward(self, input): - out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding, - self.return_indices, self.ceil_mode, self.name) - return out + def forward(self, x): + return F.avg_pool2d( + x, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + ceil_mode=self.ceil_mode, + count_include_pad=self.count_include_pad, + divisor_override=self.divisor, + data_format=self.data_format, + name=self.name) -class AdaptiveAvgPool1d(layers.Layer): +class AvgPool3d(layers.Layer): """ - - This operation applies a 1D adaptive average pooling over an input signal composed - of several input planes, based on the input, output_size, return_indices parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - The output tensor shape will be [N, C, output_size]. - - For average adaptive pool1d: - - .. math:: - - lstart &= floor(i * L_{in} / L_{out}) - - lend &= ceil((i + 1) * L_{in} / L_{out}) - - Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)} + This operation applies 3D max pooling over input features based on the input, + and kernel_size, stride, padding parameters. 
Input(X) and Output(Out) are + in NCDHW format, where N is batch size, C is the number of channels, + H is the height of the feature, D is the depth of the feature, and W is the width of the feature. Args: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size + is a tuple or list, it must contain three integers, + (kernel_size_Depth, kernel_size_Height, kernel_size_Width). + Otherwise, the pool kernel size will be the cube of an int. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain three integers, [stride_Depth, stride_Height, stride_Width). + Otherwise, the pool stride size will be a cube of an int. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + ceil_mode (bool): ${ceil_mode_comment} + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is True. + divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. + data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: - None. - + Returns: None. Raises: - ValueError: 'pool_size' should be a integer or list or tuple with length as 1. + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the output's shape calculated is not greater than 0. + + Shape: + - x: 5-D tensor. + - out: 5-D tensor. Examples: .. code-block:: python - - # average adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. 
- # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) - # import paddle import paddle.nn as nn + import numpy as np paddle.disable_static() - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - AdaptiveAvgPool1d = nn.AdaptiveAvgPool1d(output_size=16) - pool_out = AdaptiveAvgPool1d(data) - # pool_out shape: [1, 3, 16] + # avg pool3d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) + AvgPool3d = nn.AvgPool3d(kernel_size=2, + stride=2, padding=0) + output = AvgPool3d(input) + # output.shape [1, 2, 3, 16, 16] + """ - def __init__(self, output_size, name=None): - super(AdaptiveAvgPool1d, self).__init__() - self.output_size = output_size + def __init__(self, + kernel_size, + stride, + padding=0, + ceil_mode=False, + count_include_pad=True, + divisor_override=None, + data_format="NCDHW", + name=None): + super(AvgPool3d, self).__init__() + self.ksize = kernel_size + self.stride = stride + self.padding = padding + self.ceil_mode = ceil_mode + self.count_include_pad = count_include_pad + self.divisor = divisor_override + self.data_format = data_format self.name = name - def forward(self, input): - return F.adaptive_avg_pool1d(input, self.output_size, self.name) + def forward(self, x): + return F.avg_pool3d( + x, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + ceil_mode=self.ceil_mode, + count_include_pad=self.count_include_pad, + divisor_override=self.divisor, + data_format=self.data_format, + name=self.name) -class AdaptiveMaxPool1d(layers.Layer): +class MaxPool1d(layers.Layer): """ - - This operation applies a 1D adaptive max pooling over an input signal composed - of several input planes, based on the input, output_size, return_indices parameters. + Applies a 1D max pooling over an input signal composed of several input planes based + on the input, output_size, return_indices parameters. Input(X) and output(Out) are in NCL format, where N is batch size, C is the number of channels, L is the length of the feature. - The output tensor shape will be [N, C, output_size]. - For max adaptive pool1d: + The output value of the layer with input size (N, C, L), + output (N, C, L_{out}) and kernel_size k can be precisely described as + For average pool1d: .. math:: - lstart &= floor(i * L_{in} / L_{out}) - - lend &= ceil((i + 1) * L_{in} / L_{out}) - - Output(i) &= max(Input[lstart:lend])} + Output(N_i, C_i, l) &= max(Input[N_i, C_i, stride \times l:stride \times l+k])} Args: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. - return_indices (bool): If true, the index of max pooling point will be returned along - with outputs. It cannot be set in average pooling type. Default False. + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain an integer. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain an integer. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An integer, which means the feature map is zero padded by size of `padding` on every sides. + 3. 
A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides. + 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + return_indices (bool): Whether return the max indices along with the outputs. default is `False`. + ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default. + If it is set to False, the floor function will be used. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -462,53 +350,60 @@ class AdaptiveMaxPool1d(layers.Layer): None. Raises: - ValueError: 'pool_size' should be a integer or list or tuple with length as 1. + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ValueError: If `padding` is a list or tuple but its length greater than 1. + ShapeError: If the input is not a 3-D. + ShapeError: If the output's shape calculated is not greater than 0. + + + Shape: + - x: 3-D tensor. + - out: 3-D tensor. Examples: + .. code-block:: python - # max adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = max(input[:, :, lstart: lend]) - # - import paddle + import paddle import paddle.nn as nn paddle.disable_static() data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16) - pool_out = AdaptiveMaxPool1d(data) + MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0) + pool_out = MaxPool1d(data) # pool_out shape: [1, 3, 16] - # for return_indices = true - AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16, return_indices=True) - pool_out, indices = AdaptiveMaxPool1d(data) + MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0, return_indices=True) + pool_out, indices = MaxPool1d(data) # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] """ - def __init__(self, output_size, return_indices=False, name=None): - super(AdaptiveMaxPool1d, self).__init__() - self.output_size = output_size + def __init__(self, + kernel_size, + stride=None, + padding=0, + return_indices=False, + ceil_mode=False, + name=None): + super(MaxPool1d, self).__init__() + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.ceil_mode = ceil_mode self.return_indices = return_indices self.name = name def forward(self, input): - return F.adaptive_max_pool1d(input, self.output_size, - self.return_indices, self.name) + out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding, + self.return_indices, self.ceil_mode, self.name) + return out -class AvgPool2d(layers.Layer): +class MaxPool2d(layers.Layer): """ - This operation applies 2D average pooling over input features based on the input, + This operation applies 2D max pooling over input feature based on the input, 
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are in NCHW format, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. @@ -522,8 +417,9 @@ class AvgPool2d(layers.Layer): Output: Out shape: $(N, C, H_{out}, W_{out})$ $$ - out(N_i, C_j, h, w) = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1} - input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) + out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\ + & \text{input}(N_i, C_j, \text{stride[0]} \times h + m, + \text{stride[1]} \times w + n) $$ Args: @@ -532,31 +428,33 @@ class AvgPool2d(layers.Layer): Otherwise, the pool kernel size will be a square of an int. stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, it must contain two integers, (pool_stride_Height, pool_stride_Width). - Otherwise, the pool stride size will be a square of an int. Default: kernel_size. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be in three forms: `[pad_height, pad_width]` or - `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, - `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - Otherwise, the pool padding size will be a square of an int. + Otherwise, the pool stride size will be a square of an int. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is `true`. - divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + return_indices (bool): Whether to return the max indices along with the outputs. data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. - Returns: None. 
+ Returns: None Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. ShapeError: If the output's shape calculated is not greater than 0. + + Shape: + - x: 4-D tensor. + - out: 4-D tensor. + Examples: .. code-block:: python import paddle @@ -566,172 +464,72 @@ class AvgPool2d(layers.Layer): # max pool2d input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - AvgPool2d = nn.AvgPool2d(kernel_size=2, - stride=2, padding=0) - output = AvgPoo2d(input) + MaxPool2d = nn.MaxPool2d(kernel_size=2, + stride=2, padding=0) + output = MaxPool2d(input) # output.shape [1, 3, 16, 16] + # for return_indices=True + MaxPool2d = nn.MaxPool2d(kernel_size=2,stride=2, padding=0, return_indices=True) + output, max_indices = MaxPool2d(input) + # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], """ def __init__(self, kernel_size, stride=None, padding=0, + return_indices=False, ceil_mode=False, - count_include_pad=True, - divisor_override=None, data_format="NCHW", name=None): - super(AvgPool2d, self).__init__() + super(MaxPool2d, self).__init__() self.ksize = kernel_size self.stride = stride self.padding = padding + self.return_indices = return_indices self.ceil_mode = ceil_mode - self.count_include_pad = count_include_pad - self.divisor = divisor_override self.data_format = data_format self.name = name def forward(self, x): - return F.avg_pool2d( + return F.max_pool2d( x, kernel_size=self.ksize, stride=self.stride, padding=self.padding, - ceil_mode=self.ceil_mode, - count_include_pad=self.count_include_pad, - divisor_override=self.divisor, + return_indices=self.return_indices, data_format=self.data_format, name=self.name) -class MaxPool2d(layers.Layer): +class MaxPool3d(layers.Layer): """ - This operation applies 2D max pooling over input feature based on the input, + This operation applies 3D max pooling over input features based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCHW format, where N is batch size, C is the number of channels, - H is the height of the feature, and W is the width of the feature. - - Example: - Input: - X shape: $(N, C, H_{in}, W_{in})$ - Attr: - kernel_size: ksize - - Output: - Out shape: $(N, C, H_{out}, W_{out})$ - $$ - out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\ - & \text{input}(N_i, C_j, \text{stride[0]} \times h + m, - \text{stride[1]} \times w + n) - $$ - - Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain two integers, (pool_size_Height, pool_size_Width). - Otherwise, the pool kernel size will be a square of an int. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain two integers, (pool_stride_Height, pool_stride_Width). - Otherwise, the pool stride size will be a square of an int. Default: kernel_size. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be in three forms: `[pad_height, pad_width]` or - `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, - `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. 
- when `data_format` is `"NHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - Otherwise, the pool padding size will be a square of an int. - ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - return_indices (bool): Whether to return the max indices along with the outputs. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: None - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. - Examples: - .. code-block:: python - import paddle - import paddle.nn as nn - import numpy as np - paddle.disable_static() - - # max pool2d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - MaxPool2d = nn.MaxPool2d(kernel_size=2, - stride=2, padding=0) - output = MaxPool2d(input) - # output.shape [1, 3, 16, 16] - - # for return_indices=True - MaxPool2d = nn.MaxPool2d(kernel_size=2,stride=2, padding=0, return_indices=True) - output, max_indices = MaxPool2d(input) - # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], - """ - - def __init__(self, - kernel_size, - stride=None, - padding=0, - return_indices=False, - ceil_mode=False, - data_format="NCHW", - name=None): - super(MaxPool2d, self).__init__() - self.ksize = kernel_size - self.stride = stride - self.padding = padding - self.return_indices = return_indices - self.ceil_mode = ceil_mode - self.data_format = data_format - self.name = name - - def forward(self, x): - return F.max_pool2d( - x, - kernel_size=self.ksize, - stride=self.stride, - padding=self.padding, - return_indices=self.return_indices, - data_format=self.data_format, - name=self.name) - - -class MaxPool3d(layers.Layer): - """ - This operation applies 3D max pooling over input features based on the input, - and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCDHW format, where N is batch size, C is the number of channels, - H is the height of the feature, D is the depth of the feature, and W is the width of the feature. + in NCDHW format, where N is batch size, C is the number of channels, + H is the height of the feature, D is the depth of the feature, and W is the width of the feature. Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size + kernel_size (int|list|tuple): The pool kernel size. If the kernel size is a tuple or list, it must contain three integers, - (pool_size_Depth, pool_size_Height, pool_size_Width). + (kernel_size_Depth, kernel_size_Height, kernel_size_Width). Otherwise, the pool kernel size will be the cube of an int. - stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool stride size is a tuple or list, - it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`. - Otherwise, the pool stride size will be a cube of an int. Default kernel_size. - padding (int|list|tuple): The pool padding size. 
If pool padding size is a tuple or list, - it could be in three forms: `[pad_depth, pad_height, pad_width]` or - `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form - `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - ceil_mode (bool): when True, will use ceil instead of floor to compute the output shape. - count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is True. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain three integers, [stride_Depth, stride_Height, stride_Width). + Otherwise, the pool stride size will be a cube of an int. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + ceil_mode (bool): ${ceil_mode_comment} + return_indices (bool): Whether to return the max indices along with the outputs. + data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -742,6 +540,11 @@ class MaxPool3d(layers.Layer): ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. ShapeError: If the output's shape calculated is not greater than 0. + + Shape: + - x: 5-D tensor. + - out: 5-D tensor. + Examples: .. code-block:: python import paddle @@ -790,88 +593,457 @@ class MaxPool3d(layers.Layer): name=self.name) -class AvgPool3d(layers.Layer): +class AdaptiveAvgPool1d(layers.Layer): """ - This operation applies 3D max pooling over input features based on the input, - and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCDHW format, where N is batch size, C is the number of channels, - H is the height of the feature, D is the depth of the feature, and W is the width of the feature. 
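For completeness, a minimal `MaxPool3d` sketch using the arguments documented above (the output shape assumes kernel_size=2 and stride=2 on a [1, 2, 3, 32, 32] input, following the usual floor((L - k) / s) + 1 rule):

    import numpy as np
    import paddle
    import paddle.nn as nn

    paddle.disable_static()
    data = paddle.to_tensor(
        np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32))

    # 3D max pooling over non-overlapping 2 x 2 x 2 windows.
    max_pool = nn.MaxPool3d(kernel_size=2, stride=2, padding=0)
    out = max_pool(data)
    # out shape: [1, 2, 1, 16, 16]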
+ + This operation applies a 1D adaptive average pooling over an input signal composed + of several input planes, based on the input, output_size, return_indices parameters. + Input(X) and output(Out) are in NCL format, where N is batch + size, C is the number of channels, L is the length of the feature. + The output tensor shape will be [N, C, output_size]. + + For average adaptive pool1d: + + .. math:: + + lstart &= floor(i * L_{in} / L_{out}) + + lend &= ceil((i + 1) * L_{in} / L_{out}) + + Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)} Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size - is a tuple or list, it must contain three integers, - (pool_size_Depth, pool_size_Height, pool_size_Width). - Otherwise, the pool kernel size will be the cube of an int. - stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool stride size is a tuple or list, - it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`. - Otherwise, the pool stride size will be a cube of an int. - padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list, - it could be in three forms: `[pad_depth, pad_height, pad_width]` or - `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form - `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - ceil_mode (bool): ${ceil_mode_comment} - count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is True. - divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one int. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: None. + Returns: + None. + Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. + ValueError: 'pool_size' should be a integer or list or tuple with length as 1. + + Shape: + - x: 3-D tensor. + - out: 3-D tensor. + Examples: .. code-block:: python + + # average adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. 
+ # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) + # import paddle import paddle.nn as nn - import numpy as np paddle.disable_static() - # avg pool3d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) - AvgPool3d = nn.AvgPool3d(kernel_size=2, - stride=2, padding=0) - output = AvgPool3d(input) - # output.shape [1, 2, 3, 16, 16] - + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + AdaptiveAvgPool1d = nn.AdaptiveAvgPool1d(output_size=16) + pool_out = AdaptiveAvgPool1d(data) + # pool_out shape: [1, 3, 16] """ - def __init__(self, - kernel_size, - stride, - padding=0, - ceil_mode=False, - count_include_pad=True, - divisor_override=None, - data_format="NCDHW", - name=None): - super(AvgPool3d, self).__init__() - self.ksize = kernel_size - self.stride = stride - self.padding = padding - self.ceil_mode = ceil_mode - self.count_include_pad = count_include_pad - self.divisor = divisor_override - self.data_format = data_format + def __init__(self, output_size, name=None): + super(AdaptiveAvgPool1d, self).__init__() + self.output_size = output_size self.name = name - def forward(self, x): - return F.avg_pool3d( - x, - kernel_size=self.ksize, - stride=self.stride, - padding=self.padding, - ceil_mode=self.ceil_mode, - count_include_pad=self.count_include_pad, - divisor_override=self.divisor, - data_format=self.data_format, - name=self.name) + def forward(self, input): + return F.adaptive_avg_pool1d(input, self.output_size, self.name) + + +class AdaptiveAvgPool2d(layers.Layer): + """ + + This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions + of the output tensor are determined by the parameter output_size. + + For avg adaptive pool2d: + + .. math:: + + hstart &= floor(i * H_{in} / H_{out}) + + hend &= ceil((i + 1) * H_{in} / H_{out}) + + wstart &= floor(j * W_{in} / W_{out}) + + wend &= ceil((j + 1) * W_{in} / W_{out}) + + Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} + + + Parameters: + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two element, (H, W). H and W can be either a int, or None which means + the size will be the same as that of the input. + data_format (str): The data format of the input and output data. An optional string + from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in + the order of: [batch_size, input_channels, input_height, input_width]. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Shape: + x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type can be float32, float64. + output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type is same as input x. + + Returns: + A callable object of AdaptiveAvgPool2d. + + Examples: + .. code-block:: python + + # adaptive avg pool2d + # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], + # output shape is [N, C, m, n], adaptive pool divide H and W dimensions + # of input data into m * n grids averagely and performs poolings in each + # grid to get output. 
+ # adaptive avg pool performs calculations as follow: + # + # for i in range(m): + # for j in range(n): + # hstart = floor(i * H / m) + # hend = ceil((i + 1) * H / m) + # wstart = floor(i * W / n) + # wend = ceil((i + 1) * W / n) + # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) + # + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(2, 3, 32, 32) + x = paddle.to_tensor(input_data) + # x.shape is [2, 3, 32, 32] + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=3) + pool_out = adaptive_avg_pool(x = x) + # pool_out.shape is [2, 3, 3, 3] + """ + + def __init__(self, output_size, data_format="NCHW", name=None): + super(AdaptiveAvgPool2d, self).__init__() + self._output_size = output_size + self._data_format = data_format + self._name = name + + def forward(self, x): + return F.adaptive_avg_pool2d( + x, + output_size=self._output_size, + data_format=self._data_format, + name=self._name) + + +class AdaptiveAvgPool3d(layers.Layer): + """ + + This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions + of the output tensor are determined by the parameter output_size. + + For avg adaptive pool3d: + + .. math:: + + dstart &= floor(i * D_{in} / D_{out}) + + dend &= ceil((i + 1) * D_{in} / D_{out}) + + hstart &= floor(j * H_{in} / H_{out}) + + hend &= ceil((j + 1) * H_{in} / H_{out}) + + wstart &= floor(k * W_{in} / W_{out}) + + wend &= ceil((k + 1) * W_{in} / W_{out}) + + Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + + + Parameters: + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means + the size will be the same as that of the input. + data_format (str): The data format of the input and output data. An optional string + from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in + the order of: [batch_size, input_channels, input_depth, input_height, input_width]. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Shape: + x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type can be float32, float64. + output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x. + + Returns: + A callable object of AdaptiveAvgPool3d. + + Examples: + .. code-block:: python + + # adaptive avg pool3d + # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions + # of input data into l * m * n grids averagely and performs poolings in each + # grid to get output. 
+ # adaptive avg pool performs calculations as follow: + # + # for i in range(l): + # for j in range(m): + # for k in range(n): + # dstart = floor(i * D / l) + # dend = ceil((i + 1) * D / l) + # hstart = floor(j * H / m) + # hend = ceil((j + 1) * H / m) + # wstart = floor(k * W / n) + # wend = ceil((k + 1) * W / n) + # output[:, :, i, j, k] = + # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(2, 3, 8, 32, 32) + x = paddle.to_tensor(input_data) + # x.shape is [2, 3, 8, 32, 32] + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=3) + pool_out = adaptive_avg_pool(x = x) + # pool_out = [2, 3, 3, 3, 3] + """ + + def __init__(self, output_size, data_format="NCDHW", name=None): + super(AdaptiveAvgPool3d, self).__init__() + self._output_size = output_size + self._data_format = data_format + self._name = name + + def forward(self, x): + return F.adaptive_avg_pool3d( + x, + output_size=self._output_size, + data_format=self._data_format, + name=self._name) + + +class AdaptiveMaxPool1d(layers.Layer): + """ + + This operation applies a 1D adaptive max pooling over an input signal composed + of several input planes, based on the input, output_size, return_indices parameters. + Input(X) and output(Out) are in NCL format, where N is batch + size, C is the number of channels, L is the length of the feature. + The output tensor shape will be [N, C, output_size]. + + For max adaptive pool1d: + + .. math:: + + lstart &= floor(i * L_{in} / L_{out}) + + lend &= ceil((i + 1) * L_{in} / L_{out}) + + Output(i) &= max(Input[lstart:lend])} + + Args: + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one int. + return_indices (bool): If true, the index of max pooling point will be returned along + with outputs. It cannot be set in average pooling type. Default False. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Returns: + None. + + Raises: + ValueError: 'pool_size' should be a integer or list or tuple with length as 1. + + Shape: + x (Tensor): The input tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type can be float32, float64. + output (Tensor): The output tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type is same as input x. + + Examples: + .. code-block:: python + + # max adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. 
+ # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = max(input[:, :, lstart: lend]) + # + import paddle + import paddle.nn as nn + paddle.disable_static() + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16) + pool_out = AdaptiveMaxPool1d(data) + # pool_out shape: [1, 3, 16] + + # for return_indices = true + AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16, return_indices=True) + pool_out, indices = AdaptiveMaxPool1d(data) + # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + + """ + + def __init__(self, output_size, return_indices=False, name=None): + super(AdaptiveMaxPool1d, self).__init__() + self.output_size = output_size + self.return_indices = return_indices + self.name = name + + def forward(self, input): + return F.adaptive_max_pool1d(input, self.output_size, + self.return_indices, self.name) + + +class AdaptiveMaxPool2d(layers.Layer): + """ + This operation applies 2D adaptive max pooling on input tensor. The h and w dimensions + of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. + For adaptive max pool2d: + .. math:: + hstart &= floor(i * H_{in} / H_{out}) + hend &= ceil((i + 1) * H_{in} / H_{out}) + wstart &= floor(j * W_{in} / W_{out}) + wend &= ceil((j + 1) * W_{in} / W_{out}) + Output(i ,j) &= max(Input[hstart:hend, wstart:wend]) + Parameters: + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two element, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. + return_indices (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Shape: + x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float32, float64. + output (Tensor): The output tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type is same as input x. + + Returns: + A callable object of AdaptiveMaxPool2d. + Examples: + .. code-block:: python + # adaptive max pool2d + # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], + # output shape is [N, C, m, n], adaptive pool divide H and W dimensions + # of input data into m * n grids averagely and performs poolings in each + # grid to get output. 
+ # adaptive max pool performs calculations as follows:
+ #
+ # for i in range(m):
+ # for j in range(n):
+ # hstart = floor(i * H / m)
+ # hend = ceil((i + 1) * H / m)
+ # wstart = floor(j * W / n)
+ # wend = ceil((j + 1) * W / n)
+ # output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend])
+ #
+ import paddle
+ import numpy as np
+ paddle.disable_static()
+ input_data = np.random.rand(2, 3, 32, 32)
+ x = paddle.to_tensor(input_data)
+ adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=3, return_indices=True)
+ pool_out, indices = adaptive_max_pool(x=x)
+ # pool_out shape: [2, 3, 3, 3], indices shape: [2, 3, 3, 3]
+ """
+
+ def __init__(self, output_size, return_indices=False, name=None):
+ super(AdaptiveMaxPool2d, self).__init__()
+ self._output_size = output_size
+ self._return_indices = return_indices
+ self._name = name
+
+ def forward(self, x):
+ return F.adaptive_max_pool2d(
+ x,
+ output_size=self._output_size,
+ return_indices=self._return_indices,
+ name=self._name)
+
+
+class AdaptiveMaxPool3d(layers.Layer):
+ """
+ This operation applies 3D adaptive max pooling on the input tensor. The d, h and w dimensions
+ of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is that adaptive pooling focuses on the output size.
+ For adaptive max pool3d:
+ .. math::
+ dstart &= floor(i * D_{in} / D_{out})
+ dend &= ceil((i + 1) * D_{in} / D_{out})
+ hstart &= floor(j * H_{in} / H_{out})
+ hend &= ceil((j + 1) * H_{in} / H_{out})
+ wstart &= floor(k * W_{in} / W_{out})
+ wend &= ceil((k + 1) * W_{in} / W_{out})
+ Output(i, j, k) &= max(Input[dstart:dend, hstart:hend, wstart:wend])
+ Parameters:
+ output_size (int|list|tuple): The pool kernel size. If the pool kernel size is a tuple or list,
+ it must contain three elements, (D, H, W). D, H and W can be either an int, or None, which means
+ the size will be the same as that of the input.
+ return_indices (bool): If True, the indices of the max pooling points will be returned along with the outputs. Default False.
+ name(str, optional): For detailed information, please refer
+ to :ref:`api_guide_Name`. Usually name does not need to be set and
+ is None by default.
+ Shape:
+ x (Tensor): The input tensor of the adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32 or float64.
+ output (Tensor): The output tensor of the adaptive max pool3d operator, which is a 5-D tensor. The data type is the same as the input x.
+ Returns:
+ A callable object of AdaptiveMaxPool3d.
+ Examples:
+ .. code-block:: python
+ # adaptive max pool3d
+ # suppose input data is in the shape of [N, C, D, H, W], `output_size` is [l, m, n],
+ # output shape is [N, C, l, m, n]; adaptive pooling evenly divides the D, H and W dimensions
+ # of the input into l * m * n grids and performs pooling in each
+ # grid to get the output.
+ # adaptive max pool performs calculations as follows:
+ #
+ # for i in range(l):
+ # for j in range(m):
+ # for k in range(n):
+ # dstart = floor(i * D / l)
+ # dend = ceil((i + 1) * D / l)
+ # hstart = floor(j * H / m)
+ # hend = ceil((j + 1) * H / m)
+ # wstart = floor(k * W / n)
+ # wend = ceil((k + 1) * W / n)
+ # output[:, :, i, j, k] =
+ # max(input[:, :, dstart:dend, hstart: hend, wstart: wend])
+ import paddle
+ import numpy as np
+ paddle.disable_static()
+ input_data = np.random.rand(2, 3, 8, 32, 32)
+ x = paddle.to_tensor(input_data)
+ pool = paddle.nn.AdaptiveMaxPool3d(output_size=4)
+ out = pool(x)
+ # out shape: [2, 3, 4, 4, 4]
+ pool = paddle.nn.AdaptiveMaxPool3d(output_size=3, return_indices=True)
+ out, indices = pool(x)
+ # out shape: [2, 3, 3, 3, 3], indices shape: [2, 3, 3, 3, 3]
+
+ """
+
+ def __init__(self, output_size, return_indices=False, name=None):
+ super(AdaptiveMaxPool3d, self).__init__()
+ self._output_size = output_size
+ self._return_indices = return_indices
+ self._name = name
+
+ def forward(self, x):
+ return F.adaptive_max_pool3d(
+ x,
+ output_size=self._output_size,
+ return_indices=self._return_indices,
+ name=self._name)
diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py
index 49314c9832dd389411dffb3f498b34d09337a3f0..095a34cb6fc68cda6900790141d226208b203f82 100644
--- a/python/paddle/optimizer/__init__.py
+++ b/python/paddle/optimizer/__init__.py
@@ -26,9 +26,8 @@ __all__ = [
]
-from ..fluid.optimizer import SGD, Momentum, Adagrad, Dpsgd, DecayedAdagrad, \
- Ftrl, Adadelta, \
- SGDOptimizer, MomentumOptimizer, AdagradOptimizer,DpsgdOptimizer,\
+from ..fluid.optimizer import Momentum, Adagrad, Dpsgd, DecayedAdagrad, Ftrl,\
+ AdagradOptimizer,DpsgdOptimizer,\
DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \
ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\
ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \
@@ -39,6 +38,9 @@ from .adam import Adam
from .adamw import AdamW
from .adamax import Adamax
from .rmsprop import RMSProp
+from .adadelta import Adadelta
+from .sgd import SGD
+from .momentum import Momentum
from . import lr_scheduler
from .lr_scheduler import _LRScheduler, NoamLR, PiecewiseLR, NaturalExpLR, InverseTimeLR, PolynomialLR, \
diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py
new file mode 100644
index 0000000000000000000000000000000000000000..bba2c11ea07490804573189bac8b315dfc80fd37
--- /dev/null
+++ b/python/paddle/optimizer/adadelta.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable, name_scope
+
+__all__ = ["Adadelta"]
+
+
+class Adadelta(Optimizer):
+ """
+ **Notes: This API does not support sparse parameter optimization.**
+
+ Adadelta Optimizer.
Please refer to this for details:
+ `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD `_.
+
+ The update is done as follows:
+
+ .. math::
+
+ E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2
+
+ learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) }
+
+ E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2
+
+ Args:
+ learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+ It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
+ epsilon (float): a small float number for numeric stability. Default 1.0e-6.
+ rho (float): a floating point value indicating the decay rate. Default 0.95.
+ parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+ This parameter is required in dygraph mode. \
+ The default value is None in static mode, in which case all parameters will be updated.
+ weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+ It can be a float value as the coefficient of L2 regularization or \
+ :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+ If a parameter has already set a regularizer using :ref:`api_fluid_ParamAttr`, \
+ the regularization setting here in the optimizer will be ignored for this parameter. \
+ Otherwise, the regularization setting here in the optimizer will take effect. \
+ Default None, meaning there is no regularization.
+ grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+ some derived class of ``GradientClipBase`` . There are three clipping strategies
+ ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+ :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+ name (str, optional): The default value is None. Normally there is no need for user
+ to set this property. For more information, please refer to
+ :ref:`api_guide_Name` .
+
+ Examples:
+ ..
code-block:: python
+ import paddle
+ import numpy as np
+ paddle.disable_static()
+ inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+ linear = paddle.nn.Linear(10, 10)
+ inp = paddle.to_tensor(inp)
+ out = linear(inp)
+ loss = paddle.mean(out)
+ adadelta = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
+ loss.backward()
+ adadelta.step()
+ adadelta.clear_grad()
+
+ """
+
+ _avg_squared_grad_acc_str = "_avg_squared_grad"
+ _avg_squared_update_acc_str = "_avg_squared_update"
+
+ def __init__(self,
+ learning_rate=0.001,
+ epsilon=1.0e-6,
+ rho=0.95,
+ parameters=None,
+ weight_decay=None,
+ grad_clip=None,
+ name=None):
+ if learning_rate is None:
+ raise ValueError("learning_rate is not set.")
+ if epsilon is None:
+ raise ValueError("epsilon is not set.")
+ if rho is None:
+ raise ValueError("rho is not set.")
+ super(Adadelta, self).__init__(
+ learning_rate=learning_rate,
+ parameters=parameters,
+ weight_decay=weight_decay,
+ grad_clip=grad_clip,
+ name=name)
+ self.type = "adadelta"
+ self._epsilon = epsilon
+ self._rho = rho
+
+ def _create_accumulators(self, block, parameters):
+ if not isinstance(block, framework.Block):
+ raise TypeError("block is not instance of framework.Block.")
+
+ for p in parameters:
+ self._add_accumulator(self._avg_squared_grad_acc_str, p)
+ self._add_accumulator(self._avg_squared_update_acc_str, p)
+
+ def _append_optimize_op(self, block, param_and_grad):
+ if not isinstance(block, framework.Block):
+ raise TypeError("block is not instance of framework.Block.")
+
+ avg_squared_grad_acc = self._get_accumulator(
+ self._avg_squared_grad_acc_str, param_and_grad[0])
+ avg_squared_update_acc = self._get_accumulator(
+ self._avg_squared_update_acc_str, param_and_grad[0])
+
+ # Create the adadelta optimizer op
+ adadelta_op = block.append_op(
+ type=self.type,
+ inputs={
+ "Param": param_and_grad[0],
+ "Grad": param_and_grad[1],
+ "AvgSquaredGrad": avg_squared_grad_acc,
+ "AvgSquaredUpdate": avg_squared_update_acc
+ },
+ outputs={
+ "ParamOut": param_and_grad[0],
+ "AvgSquaredGradOut": avg_squared_grad_acc,
+ "AvgSquaredUpdateOut": avg_squared_update_acc
+ },
+ attrs={"epsilon": self._epsilon,
+ "rho": self._rho},
+ stop_gradient=True)
+
+ return adadelta_op
diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py
new file mode 100644
index 0000000000000000000000000000000000000000..87fa86c17615ef8cc455e95517608a246d677e74
--- /dev/null
+++ b/python/paddle/optimizer/momentum.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable, name_scope
+
+__all__ = ["Momentum"]
+
+
+class Momentum(Optimizer):
+ """
+
+ Simple Momentum optimizer with velocity state.
+
+ This optimizer has a flag for Nesterov momentum.
+
+ The update equations are as follows:
+
+ .. math::
+
+ & velocity = mu * velocity + gradient
+
+ & if (use\_nesterov):
+
+ &\quad param = param - (gradient + mu * velocity) * learning\_rate
+
+ & else:
+
+ &\quad param = param - learning\_rate * velocity
+
+ Parameters:
+
+ learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+ It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
+ momentum (float): Momentum factor. The default value is 0.9.
+ parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+ This parameter is required in dygraph mode. \
+ The default value is None in static mode, in which case all parameters will be updated.
+ use_nesterov (bool, optional): Enables Nesterov momentum. The default value is False.
+ weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+ It can be a float value as the coefficient of L2 regularization or \
+ :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+ If a parameter has already set a regularizer using :ref:`api_fluid_ParamAttr`, \
+ the regularization setting here in the optimizer will be ignored for this parameter. \
+ Otherwise, the regularization setting here in the optimizer will take effect. \
+ Default None, meaning there is no regularization.
+ grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+ some derived class of ``GradientClipBase`` . There are three clipping strategies
+ ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+ :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+ name (str, optional): The default value is None. Normally there is no need for user
+ to set this property. For more information, please refer to
+ :ref:`api_guide_Name` .
+
+ Examples:
+ ..
code-block:: python
+
+ import paddle
+ import numpy as np
+ paddle.disable_static()
+ inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+ linear = paddle.nn.Linear(10, 10)
+ inp = paddle.to_tensor(inp)
+ out = linear(inp)
+ loss = paddle.mean(out)
+ momentum = paddle.optimizer.Momentum(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
+ loss.backward()
+ momentum.step()
+ momentum.clear_grad()
+ """
+ _velocity_acc_str = "velocity"
+
+ def __init__(self,
+ learning_rate=0.001,
+ momentum=0.9,
+ parameters=None,
+ use_nesterov=False,
+ weight_decay=None,
+ grad_clip=None,
+ name=None):
+ if learning_rate is None:
+ raise ValueError("learning_rate is not set")
+ if momentum is None:
+ raise ValueError("momentum is not set")
+ super(Momentum, self).__init__(
+ learning_rate=learning_rate,
+ parameters=parameters,
+ weight_decay=weight_decay,
+ grad_clip=grad_clip,
+ name=name)
+ self.type = "momentum"
+ self._momentum = momentum
+ self._use_nesterov = bool(use_nesterov)
+
+ def _create_accumulators(self, block, parameters):
+ assert isinstance(block, framework.Block)
+
+ for p in parameters:
+ self._add_accumulator(self._velocity_acc_str, p)
+
+ def _append_optimize_op(self, block, param_and_grad):
+ assert isinstance(block, framework.Block)
+
+ velocity_acc = self._get_accumulator(self._velocity_acc_str,
+ param_and_grad[0])
+ lr = self._create_param_lr(param_and_grad)
+
+ if framework.in_dygraph_mode():
+ _, _ = core.ops.momentum(param_and_grad[0], param_and_grad[1],
+ velocity_acc, lr, param_and_grad[0],
+ velocity_acc, 'mu', self._momentum,
+ 'use_nesterov', self._use_nesterov)
+ return None
+
+ attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov}
+ inputs = {
+ "Param": [param_and_grad[0]],
+ "Grad": [param_and_grad[1]],
+ "Velocity": [velocity_acc],
+ "LearningRate": [lr]
+ }
+
+ outputs = {
+ "ParamOut": [param_and_grad[0]],
+ "VelocityOut": [velocity_acc]
+ }
+ # create the momentum optimize op
+ momentum_op = block.append_op(
+ type=self.type,
+ inputs=inputs,
+ outputs=outputs,
+ attrs=attrs,
+ stop_gradient=True)
+
+ return momentum_op
diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb3a578e15724e9501d69dc209bdedc65afeb82b
--- /dev/null
+++ b/python/paddle/optimizer/sgd.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable, name_scope
+from ..fluid.dygraph import no_grad
+__all__ = ["SGD"]
+
+
+class SGD(Optimizer):
+ """
+ Optimizer of the stochastic gradient descent algorithm.
+
+ ..
math::
+
+ param\_out = param - learning\_rate * grad
+
+ Parameters:
+ learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+ It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
+ parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+ This parameter is required in dygraph mode. \
+ The default value is None in static mode, in which case all parameters will be updated.
+ weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+ It can be a float value as the coefficient of L2 regularization or \
+ :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+ If a parameter has already set a regularizer using :ref:`api_fluid_ParamAttr`, \
+ the regularization setting here in the optimizer will be ignored for this parameter. \
+ Otherwise, the regularization setting here in the optimizer will take effect. \
+ Default None, meaning there is no regularization.
+ grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+ some derived class of ``GradientClipBase`` . There are three clipping strategies
+ ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+ :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+ name (str, optional): The default value is None. Normally there is no need for user
+ to set this property. For more information, please refer to
+ :ref:`api_guide_Name` .
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ import numpy as np
+ paddle.disable_static()
+ inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+ linear = paddle.nn.Linear(10, 10)
+ inp = paddle.to_tensor(inp)
+ out = linear(inp)
+ loss = paddle.mean(out)
+ sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
+ loss.backward()
+ sgd.step()
+ sgd.clear_grad()
+
+ """
+
+ def __init__(self,
+ learning_rate=0.001,
+ parameters=None,
+ weight_decay=None,
+ grad_clip=None,
+ name=None):
+ if learning_rate is None:
+ raise ValueError("learning_rate is not set")
+ super(SGD, self).__init__(
+ learning_rate=learning_rate,
+ parameters=parameters,
+ weight_decay=weight_decay,
+ grad_clip=grad_clip,
+ name=name)
+ self.type = "sgd"
+
+ @no_grad()
+ def _append_optimize_op(self, block, param_and_grad):
+ lr = self._create_param_lr(param_and_grad)
+ if framework.in_dygraph_mode():
+ core.ops.sgd(param_and_grad[0], lr, param_and_grad[1],
+ param_and_grad[0])
+ return None
+
+ assert isinstance(block, framework.Block)
+ # create the optimize op
+ sgd_op = block.append_op(
+ type=self.type,
+ inputs={
+ "Param": param_and_grad[0],
+ "Grad": param_and_grad[1],
+ "LearningRate": lr
+ },
+ outputs={"ParamOut": param_and_grad[0]},
+ stop_gradient=True)
+
+ return sgd_op
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 0d87c1c2cf705372de7b8534cf8faea1bb5320a6..d2db2a7cb71945e137e46d6793f8cba1f7adf12f 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -1618,6 +1618,10 @@ def clip(x, min=None, max=None, name=None):
fmax = float(np.finfo(np_dtype).max)
if in_dygraph_mode():
+ if isinstance(min, Variable):
+ min = min.numpy().item(0)
+ if isinstance(max, Variable):
+ max = max.numpy().item(0)
min = fmin if min is None else min
max =
fmax if max is None else max return core.ops.clip(x, "min", min, "max", max) diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index c652d0f1891c8bd0a4c85ea777527a2fd82ad11b..6b08599fad1dfc6b5d60c3798bba802a5ddefd02 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -94,7 +94,7 @@ def bernoulli(x, name=None): return out -def gaussian_random(shape, mean=0.0, std=1.0, dtype='float32', name=None): +def gaussian_random(shape, mean=0.0, std=1.0, dtype=None, name=None): """ This OP returns a Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. @@ -109,9 +109,10 @@ def gaussian_random(shape, mean=0.0, std=1.0, dtype='float32', name=None): std(float|int, optional): Standard deviation of the output tensor, default is 1.0. seed(int, optional): ${seed_comment} - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of - the output Tensor. Supported data types: float32, float64. - Default is float32. + dtype(str|np.dtype, optional): The data type of the output Tensor. + Supported data types: float32, float64. + Default is None, use global default dtype (see ``get_default_dtype`` + for details). name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -120,6 +121,13 @@ def gaussian_random(shape, mean=0.0, std=1.0, dtype='float32', name=None): Tensor: A Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. """ + if dtype is None: + dtype = paddle.framework.get_default_dtype() + if dtype not in ['float32', 'float64']: + raise TypeError( + "gaussian_random only supports [float32, float64], but the default dtype is %s" + % dtype) + if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) seed = 0 @@ -169,9 +177,10 @@ def standard_normal(shape, dtype=None, name=None): (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the - output tensor. Supported data types: float32, float64. If ``dytpe`` - is None, the data type is float32. Default is None. + dtype(str|np.dtype, optional): The data type of the output Tensor. + Supported data types: float32, float64. + Default is None, use global default dtype (see ``get_default_dtype`` + for details). name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -216,7 +225,11 @@ def standard_normal(shape, dtype=None, name=None): """ if dtype is None: - dtype = 'float32' + dtype = paddle.framework.get_default_dtype() + if dtype not in ['float32', 'float64']: + raise TypeError( + "standard_normal only supports [float32, float64], but the default dtype is %s" + % dtype) return gaussian_random( shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name) @@ -325,7 +338,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): return out -def uniform(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None): +def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): """ This OP returns a Tensor filled with random values sampled from a uniform distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. 
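As a quick illustration of the dtype=None behavior introduced in the hunks above, here is a minimal sketch (illustrative only; it assumes paddle.set_default_dtype and a top-level paddle.standard_normal alias are available in this build): the random op now resolves a missing dtype from the global default instead of hard-coding float32.

import paddle

paddle.disable_static()
paddle.set_default_dtype("float64")   # assumed global-default setter
x = paddle.standard_normal([2, 3])    # dtype=None -> resolved to float64
print(x.dtype)                        # expected to report a float64 tensor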
@@ -343,9 +356,10 @@ def uniform(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None): (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - dtype(str|np.dtype, optional): The data type of - the output Tensor. Supported data types: float32, float64. - Default is float32. + dtype(str|np.dtype, optional): The data type of the output Tensor. + Supported data types: float32, float64. + Default is None, use global default dtype (see ``get_default_dtype`` + for details). min(float|int, optional): The lower bound on the range of random values to generate, ``min`` is included in the range. Default is -1.0. max(float|int, optional): The upper bound on the range of random values @@ -401,6 +415,13 @@ def uniform(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None): """ + if dtype is None: + dtype = paddle.framework.get_default_dtype() + if dtype not in ['float32', 'float64']: + raise TypeError( + "uniform only supports [float32, float64], but the default dtype is %s" + % dtype) + if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) @@ -447,7 +468,7 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). Default is [1]. - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the + dtype(str|np.dtype, optional): The data type of the output tensor. Supported data types: int32, int64. If ``dytpe`` is None, the data type is int64. Default is None. name(str, optional): The default value is None. Normally there is no @@ -550,7 +571,7 @@ def randperm(n, dtype="int64", name=None): Args: n(int): The upper bound (exclusive), and it should be greater than 0. - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of + dtype(str|np.dtype, optional): The data type of the output Tensor. Supported data types: int32, int64, float32, float64. Default is int64. name(str, optional): The default value is None. Normally there is no @@ -622,9 +643,10 @@ def rand(shape, dtype=None, name=None): (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the - output tensor. Supported data types: float32, float64. If ``dytpe`` - is None, the data type is float32. Default is None. + dtype(str|np.dtype, optional): The data type of the output Tensor. + Supported data types: float32, float64. + Default is None, use global default dtype (see ``get_default_dtype`` + for details). name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. 
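The same pattern applies to ``uniform`` after the change above (a sketch, assuming the function is re-exported as paddle.uniform): an explicitly passed dtype still wins, while dtype=None follows the global default and, per the new check, raises TypeError when that default is not float32/float64.

import paddle

paddle.disable_static()
paddle.set_default_dtype("float64")
a = paddle.uniform(shape=[2, 2], min=0.0, max=1.0)  # dtype=None -> float64
b = paddle.uniform(shape=[2, 2], dtype="float32")   # explicit dtype unaffected
print(a.dtype, b.dtype)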
@@ -668,7 +690,11 @@ def rand(shape, dtype=None, name=None): """ if dtype is None: - dtype = 'float32' + dtype = paddle.framework.get_default_dtype() + if dtype not in ['float32', 'float64']: + raise TypeError( + "rand only supports [float32, float64], but the default dtype is %s" + % dtype) out = uniform(shape, dtype, min=0.0, max=1.0, name=name) out.stop_gradient = True diff --git a/tools/summary_env.py b/tools/summary_env.py index 0252d9adcd07255e69a1abd81c7704eda02745b8..39d6acaf536c533a218d3d53b596c469ab19922d 100644 --- a/tools/summary_env.py +++ b/tools/summary_env.py @@ -55,7 +55,7 @@ def get_os_info(): else: plat = None ver = None - envs['os_info'] = "{} {}".format(plat, ver) + envs['os_info'] = "{0} {1}".format(plat, ver) def get_python_info(): @@ -93,7 +93,7 @@ def get_cudnn_info(): if cudnn_dll_path: cudnn_header_path = cudnn_dll_path.split('bin')[ 0] + 'include\cudnn.h' - cmd = 'type "{}" | findstr "{}" | findstr /v "CUDNN_VERSION"' + cmd = 'type "{0}" | findstr "{1}" | findstr /v "CUDNN_VERSION"' else: envs['cudnn_version'] = None return @@ -102,7 +102,7 @@ def get_cudnn_info(): 'whereis "cudnn.h" | awk \'{print $2}\'') if cudnn_header_path: cudnn_header_path = cudnn_header_path.strip() - cmd = 'cat "{}" | grep "{}" | grep -v "CUDNN_VERSION"' + cmd = 'cat "{0}" | grep "{1}" | grep -v "CUDNN_VERSION"' else: envs['cudnn_version'] = None return @@ -112,7 +112,7 @@ def get_cudnn_info(): patch_level = _get_cudnn_ver( cmd.format(cudnn_header_path, 'CUDNN_PATCHLEVEL')) - envs['cudnn_version'] = "{}.{}.{}".format(major, minor, patch_level) + envs['cudnn_version'] = "{0}.{1}.{2}".format(major, minor, patch_level) def get_driver_info(): @@ -132,7 +132,7 @@ def main(): get_cuda_info() get_cudnn_info() get_driver_info() - print(envs_template.format(**envs)) + print('*' * 40 + envs_template.format(**envs) + '*' * 40) if __name__ == '__main__':
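Tying the new ``paddle.optimizer`` classes from this patch together, a minimal dygraph training step could look like the sketch below (illustrative only; it uses just the APIs shown in the docstring examples above, and the Adadelta and SGD wrappers added here are drop-in replacements for Momentum).

import numpy as np
import paddle

paddle.disable_static()
linear = paddle.nn.Linear(10, 1)
opt = paddle.optimizer.Momentum(learning_rate=0.01,
                                momentum=0.9,
                                parameters=linear.parameters())

for _ in range(3):
    x = paddle.to_tensor(np.random.rand(4, 10).astype("float32"))
    loss = paddle.mean(linear(x))
    loss.backward()     # accumulate gradients
    opt.step()          # apply the momentum update defined in momentum.py above
    opt.clear_grad()    # reset gradients for the next step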