Commit fef6f6f9 authored by: S seiriosPlus

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize/large_scale_kv_spped

......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
#include <cmath>
#include <functional>
#include <string>
#include <vector>
......@@ -74,12 +75,17 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
auto weights_shape = weights->dims();
auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
auto* weights_data = weights->mutable_data<float>(platform::CPUPlace());
EigenMatrixArrayMap weights_array_2d(
weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0],
weights_shape_2d[1]);
weights_array_2d.colwise() *= scale_array;
  // Check for subnormal values that slow down convolution execution
for (int i = 0; i < weights->numel(); ++i) {
if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0;
}
}
void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
......@@ -108,13 +114,6 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
GET_CONV_BN_NODES(conv_ac_pattern);
// check if fuse can be done and if MKL-DNN should be used
FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel);
if (fuse_option == DO_NOT_FUSE) {
VLOG(3) << "do not perform conv+affinechannel fuse";
return;
}
// Create eltwise_y (conv bias) variable
VarDesc eltwise_y_in_desc(
patterns::PDNodeName(name_scope_, "eltwise_y_in"));
......@@ -143,6 +142,7 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
desc.SetType("elementwise_add");
desc.SetAttr("axis", 1);
desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists<bool>("use_mkldnn"));
auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel});
......
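The loop added above zeros any subnormal weight after the per-channel scale multiplication, since subnormal operands can slow convolution kernels considerably. A minimal standalone sketch of the same check, independent of Paddle (buffer and length are arbitrary):

#include <cmath>
#include <cstddef>

// Flush subnormal (denormal) floats to zero in place, mirroring the
// std::fpclassify(x) == FP_SUBNORMAL check used in the fuse pass above.
void FlushSubnormalsToZero(float* data, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    if (std::fpclassify(data[i]) == FP_SUBNORMAL) data[i] = 0.0f;
  }
}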
......@@ -15,7 +15,6 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
......@@ -103,8 +102,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
// params_file_ fields.
CP_MEMBER(opt_cache_dir_);
prog_file_ = std::move(other.prog_file_);
params_file_ = std::move(other.params_file_);
CP_MEMBER(prog_file_);
CP_MEMBER(params_file_);
CP_MEMBER(use_fc_padding_);
// GPU related.
......
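Note on the change above: the copy constructor now copies prog_file_ and params_file_ with CP_MEMBER instead of moving them out of the (const) source object. CP_MEMBER itself is not shown in this excerpt; it is presumably a plain member-copy macro along these lines (an assumption, for illustration only):

// Hypothetical definition matching how CP_MEMBER is used above:
// copy the named member from `other` into this object.
#define CP_MEMBER(member__) member__ = other.member__;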
......@@ -32,7 +32,6 @@
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
......@@ -517,6 +516,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
// TODO(NHZlX): Should add the link to the doc of
// paddle_infer::CreatePredictor<paddle_infer::Config>
if (config.glog_info_disabled()) {
FLAGS_logtostderr = 1;
FLAGS_minloglevel = 2; // GLOG_ERROR
......@@ -1058,3 +1059,122 @@ USE_TRT_CONVERTER(skip_layernorm);
USE_TRT_CONVERTER(slice);
USE_TRT_CONVERTER(scale);
#endif
namespace paddle_infer {
void Tensor::Reshape(const std::vector<int> &shape) { tensor_->Reshape(shape); }
std::vector<int> Tensor::shape() const { return tensor_->shape(); }
void Tensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
return tensor_->SetLoD(x);
}
std::vector<std::vector<size_t>> Tensor::lod() const { return tensor_->lod(); }
const std::string &Tensor::name() const { return tensor_->name(); }
DataType Tensor::type() const { return tensor_->type(); }
Predictor::Predictor(const Config &config) {
const_cast<Config *>(&config)->SwitchUseFeedFetchOps(false);
// The second parameter indicates that the discard log is not printed
predictor_ = paddle::CreatePaddlePredictor<
Config, paddle::PaddleEngineKind::kAnalysis>(config);
}
std::vector<std::string> Predictor::GetInputNames() {
return predictor_->GetInputNames();
}
std::unique_ptr<Tensor> Predictor::GetInputHandle(const std::string &name) {
auto zero_copy_tensor = predictor_->GetInputTensor(name);
std::unique_ptr<Tensor> tensor(new Tensor(std::move(zero_copy_tensor)));
return tensor;
}
std::vector<std::string> Predictor::GetOutputNames() {
return predictor_->GetOutputNames();
}
std::unique_ptr<Tensor> Predictor::GetOutputHandle(const std::string &name) {
auto zero_copy_tensor = predictor_->GetOutputTensor(name);
std::unique_ptr<Tensor> tensor(new Tensor(std::move(zero_copy_tensor)));
return tensor;
}
bool Predictor::Run() { return predictor_->ZeroCopyRun(); }
std::unique_ptr<Predictor> Predictor::Clone() {
auto analysis_pred = predictor_->Clone();
std::unique_ptr<Predictor> pred(new Predictor(std::move(analysis_pred)));
return pred;
}
void Predictor::ClearIntermediateTensor() {
predictor_->ClearIntermediateTensor();
}
int GetNumBytesOfDataType(DataType dtype) {
switch (dtype) {
case DataType::FLOAT32:
return sizeof(float);
case DataType::INT64:
return sizeof(int64_t);
case DataType::INT32:
return sizeof(int32_t);
case DataType::UINT8:
return sizeof(uint8_t);
default:
assert(false);
return -1;
}
}
std::string GetVersion() { return paddle::get_version(); }
std::string UpdateDllFlag(const char *name, const char *value) {
return paddle::UpdateDllFlag(name, value);
}
} // namespace paddle_infer
namespace paddle_infer {
std::shared_ptr<Predictor> CreatePredictor(const Config &config) { // NOLINT
std::shared_ptr<Predictor> predictor(new Predictor(config));
return predictor;
}
namespace services {
PredictorPool::PredictorPool(const Config &config, size_t size) {
PADDLE_ENFORCE_GE(
size, 1UL,
paddle::platform::errors::InvalidArgument(
"The predictor pool size should be greater than 1, but it's (%d)",
size));
Config copy_config(config);
main_pred_.reset(new Predictor(config));
for (size_t i = 0; i < size - 1; i++) {
if (config.tensorrt_engine_enabled()) {
Config config_tmp(copy_config);
preds_.push_back(
std::move(std::unique_ptr<Predictor>(new Predictor(config_tmp))));
} else {
preds_.push_back(std::move(main_pred_->Clone()));
}
}
}
Predictor *PredictorPool::Retrive(size_t idx) {
PADDLE_ENFORCE_LT(
idx, preds_.size() + 1,
paddle::platform::errors::InvalidArgument(
"There are (%d) predictors in the pool, but the idx is (%d)", idx,
preds_.size() + 1));
if (idx == 0) {
return main_pred_.get();
}
return preds_[idx - 1].get();
}
} // namespace services
} // namespace paddle_infer
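For reference, a minimal usage sketch of the paddle_infer wrapper implemented above, modeled on the tests added later in this commit; the header path, model paths, and input shape are placeholders:

#include <functional>
#include <numeric>
#include <vector>
#include "paddle_inference_api.h"  // header declaring paddle_infer (path is an assumption)

void RunOnce() {
  paddle_infer::Config config;
  config.SetModel("/path/to/model", "/path/to/params");
  auto predictor = paddle_infer::CreatePredictor(config);

  // Feed an all-ones tensor through the first input handle.
  auto input_t = predictor->GetInputHandle(predictor->GetInputNames()[0]);
  std::vector<float> input(1 * 3 * 318 * 318, 1.0f);
  input_t->Reshape({1, 3, 318, 318});
  input_t->CopyFromCpu(input.data());

  predictor->Run();

  // Copy the first output back to host memory.
  auto output_t = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
  auto out_shape = output_t->shape();
  int out_num = std::accumulate(out_shape.begin(), out_shape.end(), 1,
                                std::multiplies<int>());
  std::vector<float> out(out_num);
  output_t->CopyToCpu(out.data());
  // out_num * GetNumBytesOfDataType(output_t->type()) gives the size in bytes.
}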
......@@ -112,6 +112,12 @@ void PaddleBuf::Free() {
}
}
NativeConfig::NativeConfig() {
LOG(WARNING) << "The paddle::NativeConfig interface is going to be "
"deprecated in the next release, plase use the latest "
"paddle_infer::Config instead.";
}
std::string get_version() {
std::stringstream ss;
ss << "version: " << framework::paddle_version() << "\n";
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <glog/logging.h>
#include <algorithm>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>
......@@ -25,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -311,6 +313,8 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
// TODO(NHZlX): Should add the link to the doc of
// paddle_infer::CreatePredictor<paddle_infer::Config>
VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) {
// 1. GPU memory
......
......@@ -347,6 +347,7 @@ class PD_INFER_DECL PaddlePredictor {
/// place of inference, etc.)
///
struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config {
NativeConfig();
/// GPU related fields.
bool use_gpu{false};
int device{0};
......@@ -421,7 +422,8 @@ enum class PaddleEngineKind {
};
template <typename ConfigT, PaddleEngineKind engine>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
const ConfigT& config);
template <>
PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
......@@ -437,6 +439,4 @@ PD_INFER_DECL std::string get_version();
PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value);
PD_INFER_DECL std::shared_ptr<framework::Cipher> MakeCipher(
const std::string& config_file);
} // namespace paddle
......@@ -22,9 +22,124 @@ limitations under the License. */
#pragma once
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle_analysis_config.h" // NOLINT
#include "paddle_api.h" // NOLINT
namespace paddle_infer {
using DataType = paddle::PaddleDType;
using PlaceType = paddle::PaddlePlace;
using PrecisionType = paddle::AnalysisConfig::Precision;
using Config = paddle::AnalysisConfig;
class PD_INFER_DECL Tensor {
public:
  // Can only be created by predictor->GetInputHandle(const std::string& name)
  // or predictor->GetOutputHandle(const std::string& name)
Tensor() = delete;
explicit Tensor(std::unique_ptr<paddle::ZeroCopyTensor>&& tensor)
: tensor_(std::move(tensor)) {}
void Reshape(const std::vector<int>& shape);
template <typename T>
void CopyFromCpu(const T* data);
// should add the place
template <typename T>
T* mutable_data(PlaceType place);
template <typename T>
void CopyToCpu(T* data);
template <typename T>
T* data(PlaceType* place, int* size) const;
void SetLoD(const std::vector<std::vector<size_t>>& x);
std::vector<std::vector<size_t>> lod() const;
DataType type() const;
std::vector<int> shape() const;
const std::string& name() const;
private:
std::unique_ptr<paddle::ZeroCopyTensor> tensor_;
};
class PD_INFER_DECL Predictor {
public:
Predictor() = default;
~Predictor() {}
// Use for clone
explicit Predictor(std::unique_ptr<paddle::PaddlePredictor>&& pred)
: predictor_(std::move(pred)) {}
explicit Predictor(const Config& config);
std::vector<std::string> GetInputNames();
std::unique_ptr<Tensor> GetInputHandle(const std::string& name);
bool Run();
std::vector<std::string> GetOutputNames();
std::unique_ptr<Tensor> GetOutputHandle(const std::string& name);
std::unique_ptr<Predictor> Clone();
void ClearIntermediateTensor();
private:
std::unique_ptr<paddle::PaddlePredictor> predictor_;
};
PD_INFER_DECL std::shared_ptr<Predictor> CreatePredictor(
const Config& config); // NOLINT
PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype);
PD_INFER_DECL std::string GetVersion();
PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value);
template <typename T>
void Tensor::CopyFromCpu(const T* data) {
tensor_->copy_from_cpu<T>(data);
}
template <typename T>
void Tensor::CopyToCpu(T* data) {
return tensor_->copy_to_cpu<T>(data);
}
template <typename T>
T* Tensor::mutable_data(PlaceType place) {
return tensor_->mutable_data<T>(place);
}
template <typename T>
T* Tensor::data(PlaceType* place, int* size) const {
return tensor_->data<T>(place, size);
}
} // namespace paddle_infer
namespace paddle_infer {
namespace services {
class PD_INFER_DECL PredictorPool {
public:
PredictorPool() = delete;
PredictorPool(const PredictorPool&) = delete;
PredictorPool& operator=(const PredictorPool&) = delete;
explicit PredictorPool(const Config& config, size_t size = 1);
Predictor* Retrive(size_t idx);
private:
std::shared_ptr<Predictor> main_pred_;
std::vector<std::unique_ptr<Predictor>> preds_;
};
} // namespace services
} // namespace paddle_infer
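A short sketch of the PredictorPool declared above, following the test added later in this commit; the pool size, index, and model path are arbitrary:

#include "paddle_inference_api.h"  // header declaring paddle_infer (path is an assumption)

void UsePool() {
  paddle_infer::Config config;
  config.SetModel("/path/to/model_dir");
  // Pool of 4 predictors built from one config; per the implementation above,
  // index 0 returns the main predictor and the rest are clones (or fresh
  // predictors when the TensorRT engine is enabled).
  paddle_infer::services::PredictorPool pool(config, 4);
  paddle_infer::Predictor* pred = pool.Retrive(2);
  auto in_names = pred->GetInputNames();
  // Feed inputs via pred->GetInputHandle(in_names[0]), then call pred->Run().
}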
......@@ -188,6 +188,8 @@ void CpuPassStrategy::EnableMKLDNN() {
"depthwise_conv_mkldnn_pass", //
"conv_bn_fuse_pass", // Execute BN passes again to
"conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_transpose_bn_fuse_pass", //
"conv_transpose_eltwiseadd_bn_fuse_pass", //
"conv_bias_mkldnn_fuse_pass", //
......
......@@ -54,7 +54,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
auto ptr = new SkipLayerNormPluginDynamic(
bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr;
}
......
......@@ -515,3 +515,9 @@ if(WITH_MKLDNN)
inference_analysis_test(test_analyzer_capi_ner SRCS analyzer_capi_ner_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model)
if(WITH_GPU)
inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${RESNET50_MODEL_DIR})
endif()
......@@ -72,3 +72,59 @@ TEST(AnalysisPredictor, use_gpu) {
} // namespace inference
} // namespace paddle
namespace paddle_infer {
TEST(Predictor, use_gpu) {
std::string model_dir = FLAGS_infer_model + "/" + "model";
Config config;
config.EnableUseGpu(100, 0);
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableLiteEngine(PrecisionType::kFloat32);
auto predictor = CreatePredictor(config);
const int batch = 1;
const int channel = 3;
const int height = 318;
const int width = 318;
const int input_num = batch * channel * height * width;
std::vector<float> input(input_num, 1);
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
input_t->Reshape({1, 3, 318, 318});
input_t->CopyFromCpu(input.data());
predictor->Run();
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_t->shape();
size_t out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>());
std::vector<float> out_data;
out_data.resize(out_num);
output_t->CopyToCpu(out_data.data());
const std::vector<float> truth_values = {
127.780396f, 738.16656f, 1013.2264f, -438.17206f, 366.4022f,
927.66187f, 736.2241f, -633.68567f, -329.92737f, -430.15637f,
-633.0639f, -146.54858f, -1324.2804f, -1349.3661f, -242.67671f,
117.44864f, -801.7251f, -391.51495f, -404.8202f, 454.16132f,
515.48206f, -133.03114f, 69.293076f, 590.09753f, -1434.6917f,
-1070.8903f, 307.0744f, 400.52573f, -316.12177f, -587.1265f,
-161.05742f, 800.3663f, -96.47157f, 748.708f, 868.17645f,
-447.9403f, 112.73656f, 1127.1992f, 47.43518f, 677.7219f,
593.1881f, -336.4011f, 551.3634f, 397.82474f, 78.39835f,
-715.4006f, 405.96988f, 404.25684f, 246.01978f, -8.430191f,
131.36617f, -648.0528f};
float* data_o = out_data.data();
for (size_t j = 0; j < out_num; j += 10) {
EXPECT_NEAR((data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0.,
10e-5);
}
}
} // namespace paddle_infer
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cuda_runtime.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cstring>
#include <numeric>
#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
namespace paddle_infer {
TEST(Predictor, use_gpu) {
LOG(INFO) << GetVersion();
UpdateDllFlag("conv_workspace_size_limit", "4000");
std::string model_dir = FLAGS_infer_model + "/model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableUseGpu(100, 0);
auto predictor = CreatePredictor(config);
auto pred_clone = predictor->Clone();
std::vector<int> in_shape = {1, 3, 318, 318};
int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
[](int &a, int &b) { return a * b; });
std::vector<float> input(in_num, 0);
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
input_t->Reshape(in_shape);
input_t->CopyFromCpu(input.data());
predictor->Run();
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_t->shape();
int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>());
std::vector<float> out_data;
out_data.resize(out_num);
output_t->CopyToCpu(out_data.data());
predictor->ClearIntermediateTensor();
}
TEST(PredictorPool, basic) {
LOG(INFO) << GetVersion();
UpdateDllFlag("conv_workspace_size_limit", "4000");
std::string model_dir = FLAGS_infer_model + "/model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableUseGpu(100, 0);
services::PredictorPool pred_pool(config, 4);
auto pred = pred_pool.Retrive(2);
std::vector<int> in_shape = {1, 3, 318, 318};
int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
[](int &a, int &b) { return a * b; });
std::vector<float> input(in_num, 0);
auto in_names = pred->GetInputNames();
auto input_t = pred->GetInputHandle(in_names[0]);
input_t->name();
input_t->Reshape(in_shape);
input_t->CopyFromCpu(input.data());
pred->Run();
auto out_names = pred->GetOutputNames();
auto output_t = pred->GetOutputHandle(out_names[0]);
auto out_type = output_t->type();
LOG(INFO) << GetNumBytesOfDataType(out_type);
if (out_type == DataType::FLOAT32) {
PlaceType place;
int size;
output_t->data<float>(&place, &size);
}
}
} // namespace paddle_infer
......@@ -41,7 +41,7 @@ TEST(AnalysisPredictor, use_gpu) {
SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
std::vector<PaddleTensor> outputs;
for (auto& input : inputs_all) {
for (auto &input : inputs_all) {
ASSERT_TRUE(predictor->Run(input, &outputs));
predictor->ClearIntermediateTensor();
}
......@@ -49,3 +49,27 @@ TEST(AnalysisPredictor, use_gpu) {
} // namespace inference
} // namespace paddle
namespace paddle_infer {
TEST(PredictorPool, use_gpu) {
std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
Config config;
config.EnableUseGpu(100, 0);
config.SetModel(model_dir);
config.EnableTensorRtEngine();
services::PredictorPool pred_pool(config, 1);
auto predictor = pred_pool.Retrive(0);
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
std::vector<int> in_shape = {1, 3, 224, 224};
int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
[](int &a, int &b) { return a * b; });
std::vector<float> input(in_num, 0);
input_t->Reshape(in_shape);
input_t->CopyFromCpu(input.data());
predictor->Run();
}
} // namespace paddle_infer
......@@ -20,6 +20,7 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h"
......@@ -1231,3 +1232,24 @@ REGISTER_OP_CPU_KERNEL(
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
ops::AbsGradFunctor<int64_t>>);
/* ========================================================================== */
/* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(leaky_relu)
.AddCheckpoint(
R"ROC(fix leaky_relu, bahavior changed when alpha < 0 or alpha > 1)ROC",
paddle::framework::compatible::OpVersionDesc()
.BugfixWithBehaviorChanged(
"leaky_relu calculate formula before checkponit: out = max(x, "
"alpha * x); after checkpoint: out = x if x > 0 else alpha * "
"x"));
REGISTER_OP_VERSION(hard_shrink)
.AddCheckpoint(
R"ROC(fix hard_shrink, bahavior changed when threshold<0)ROC",
paddle::framework::compatible::OpVersionDesc()
.BugfixWithBehaviorChanged(
"hard_shrink calculate formula before checkponit: out = x * "
"((x < -threshold) + (x > threshold)); after checkpoint: out = "
"x * (((x < -threshold) + (x > threshold)) > 0)"));
/* ========================================================================== */
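To make the leaky_relu change above concrete: the two formulas agree for 0 <= alpha <= 1 and diverge otherwise, e.g. x = 1, alpha = 2 yields 2 under the old formula and 1 under the fixed one. A small sketch, independent of Paddle:

#include <algorithm>

// Old behavior recorded in the checkpoint: out = max(x, alpha * x).
float LeakyReluOld(float x, float alpha) { return std::max(x, alpha * x); }

// Fixed behavior: out = x if x > 0 else alpha * x.
float LeakyReluNew(float x, float alpha) { return x > 0 ? x : alpha * x; }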
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#ifdef PADDLE_WITH_MKLDNN
......@@ -567,3 +568,14 @@ REGISTER_OP_CPU_KERNEL(
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>);
REGISTER_OP_VERSION(conv_transpose)
.AddCheckpoint(
R"ROC(
      Upgrade conv_transpose, add a new attribute [output_padding].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"output_padding",
"In order to add additional size to one side of each dimension "
"in the output",
{}));
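For context on the new output_padding attribute: in the usual transposed-convolution size relation (general background, not taken from this diff), the extra size is added to one side of each spatial dimension:

    out = (in - 1) * stride - 2 * padding + dilation * (kernel - 1) + 1 + output_padding

For example, in = 4, stride = 2, padding = 0, dilation = 1, kernel = 3, output_padding = 1 gives out = (4 - 1) * 2 + (3 - 1) + 1 + 1 = 10.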
......@@ -56,7 +56,7 @@ endif()
cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op)
DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op scale_op)
cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope)
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory)
......
......@@ -132,6 +132,15 @@ void ProcGetResponse(const VarHandle& var_h,
&trainer_id);
}
void ProcGetRecvResponse(const VarHandle& var_h,
const ::grpc::ByteBuffer& ret_msg) {
VLOG(4) << "ProcGetRecvResponse";
framework::Variable* outvar = nullptr;
int trainer_id;
DeserializeRecvFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar,
&trainer_id);
}
template <typename T>
void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
::grpc::Slice slice(proto.ByteSizeLong());
......@@ -482,6 +491,79 @@ VarHandlePtr GRPCClient::AsyncDistributeNotify(
return h;
}
VarHandlePtr GRPCClient::AsyncSendAndRecv(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& send_var_name,
const std::string& recv_var_name,
const std::string& table_name,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string send_var_name_val = send_var_name;
const std::string recv_var_name_val = recv_var_name;
const std::string table_name_val = table_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
const std::string method = kSendAndRecvRPC;
VLOG(4) << "GRPCClient::SendAndRecv Begin ,Send_var_name: "
<< send_var_name_val << " Recv_var_name: " << recv_var_name_val;
int retry_times_ = 0;
while (true) {
SendAndRecvProcessor* s = new SendAndRecvProcessor(ch);
VarHandlePtr h(
new VarHandle(ep, method, send_var_name_val, p_ctx, p_scope));
VarHandlePtr h_recv(
new VarHandle(ep, method, recv_var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
s->RecvPrepare(h_recv);
framework::AsyncIO([send_var_name_val, recv_var_name_val, table_name_val,
p_scope, p_ctx, s, method, h, this] {
auto* send_var = p_scope->FindVar(send_var_name_val);
send_var->GetMutable<framework::LoDTensor>()->set_lod({});
::grpc::ByteBuffer buf;
VLOG(4) << "SerializeToByteBuffer: send_var_name_val: "
<< send_var_name_val
<< " recv_var_name_val: " << recv_var_name_val;
SerializeToByteBuffer(send_var_name_val, send_var, *p_ctx, &buf,
recv_var_name_val, trainer_id_, table_name_val);
VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
// stub context
s->response_call_back_ = ProcGetRecvResponse;
platform::RecordRPCEvent record_event(method);
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendAndRecvVariable",
buf, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
});
req_count_++;
if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) {
h->Wait();
if (h->should_retry) {
VLOG(3) << "rpc call failed, retry times " << retry_times_;
retry_times_++;
std::random_device rd;
std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
continue;
}
}
return h;
}
}
bool GRPCClient::Wait() {
std::unique_lock<std::mutex> lk(sync_mutex_);
sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); });
......
......@@ -53,6 +53,8 @@ namespace distributed {
void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
void ProcGetRecvResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
class BaseProcessor {
public:
BaseProcessor() { context_ = nullptr; }
......@@ -131,6 +133,28 @@ class GetProcessor : public BaseProcessor {
RequestGetCallBack response_call_back_ = ProcGetResponse;
};
class SendAndRecvProcessor : public BaseProcessor {
public:
explicit SendAndRecvProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(), stub_g_(ch) {}
virtual ~SendAndRecvProcessor() {}
void ProcessImpl() override {
if (response_call_back_) {
response_call_back_(*var_h_recv_.get(), reply_);
var_h_recv_->Finish(true);
}
}
void RecvPrepare(VarHandlePtr h_recv) { var_h_recv_ = h_recv; }
::grpc::ByteBuffer reply_;
::grpc::GenericStub stub_g_;
RequestGetCallBack response_call_back_ = ProcGetResponse;
VarHandlePtr var_h_recv_;
};
class BatchBarrierProcessor : public BaseProcessor {
public:
explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
......@@ -231,6 +255,14 @@ class GRPCClient : public RPCClient {
const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendAndRecv(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& send_var_name,
const std::string& recv_var_name,
const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendComplete(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
......
......@@ -76,7 +76,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
PADDLE_THROW("Serialize does not support type: %s",
typeid(var->Type()).name());
}
std::string header;
request.AppendToString(&header);
auto buffer = std::unique_ptr<char[]>(new char[1024]);
......@@ -101,7 +100,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
}
#endif
PADDLE_ENFORCE_NOT_NULL(payload);
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
payload->memory_size());
if (payload->memory_size() >= std::numeric_limits<int>::max()) {
......@@ -140,7 +138,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
::grpc::Slice::STEAL_REF);
num_slices = 4;
}
::grpc::ByteBuffer tmp(&slices[0], num_slices);
msg->Swap(&tmp);
}
......@@ -156,6 +153,19 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
*trainer_id = resp.GetTrainerId();
}
void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable** var, int* trainer_id) {
platform::RecordRPCEvent record_event("deserial");
operators::distributed::GRPCVariableResponse resp(scope, &ctx);
PADDLE_ENFORCE_EQ(
resp.Parse(msg), 0,
platform::errors::InvalidArgument("parse bytebuffer to tensor error!"));
*var = resp.GetRecvVar();
*trainer_id = resp.GetTrainerId();
}
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -47,6 +47,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const framework::Scope* scope,
framework::Variable** var, int* trainer_id);
void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable** var, int* trainer_id);
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -28,6 +28,7 @@ DECLARE_int32(rpc_retry_bind_port);
namespace paddle {
namespace operators {
namespace distributed {
enum CallStatus { PROCESS = 0, FINISH };
// reference:
......@@ -433,6 +434,51 @@ class RequestNotify final : public RequestBase {
ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
};
class RequestSendAndRecv final : public RequestBase {
public:
explicit RequestSendAndRecv(GrpcService::AsyncService* service,
::grpc::ServerCompletionQueue* cq,
RequestHandler* request_handler, int req_id)
: RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
request_.reset(new GRPCVariableResponse(
request_handler->scope(), request_handler->dev_ctx(),
request_handler->distributed_mode()));
int method_id =
static_cast<int>(distributed::GrpcMethod::kRequestSendAndRecv);
service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
}
virtual ~RequestSendAndRecv() {}
std::string GetReqName() override { return request_->Varname(); }
void Process() override {
std::string in_var_name = request_->Varname();
std::string out_var_name = request_->OutVarname();
std::string table_name = request_->TableName();
int trainer_id = request_->GetTrainerId();
VLOG(4) << "RequestSendAndRecv, in_var_name: " << in_var_name
<< " out_var_name: " << out_var_name << " trainer: " << trainer_id;
auto scope = request_->GetMutableLocalScope();
auto invar = scope->FindVar(in_var_name);
framework::Variable* outvar = nullptr;
request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
out_var_name, table_name);
SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
&reply_);
Finish(reply_, &responder_);
}
protected:
std::shared_ptr<GRPCVariableResponse> request_;
::grpc::ByteBuffer reply_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
};
void AsyncGRPCServer::WaitServerReady() {
VLOG(4) << "AsyncGRPCServer is waiting server ready";
std::unique_lock<std::mutex> lock(this->mutex_ready_);
......@@ -586,6 +632,8 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id);
} else if (rpc_name == kRequestNotify) {
b = new RequestNotify(service_.get(), cq.get(), handler, req_id);
} else if (rpc_name == kRequestSendAndRecv) {
b = new RequestSendAndRecv(service_.get(), cq.get(), handler, req_id);
} else {
PADDLE_ENFORCE(false, "not supported rpc");
}
......
......@@ -85,10 +85,12 @@ enum class GrpcMethod {
kGetMonomerVariable,
kGetMonomerBarrier,
kRequestNotify,
kRequestSendAndRecv,
// when you add new handler, change kGrpcNumMethods at the same time!
};
static const int kGrpcNumMethods =
static_cast<int>(GrpcMethod::kRequestNotify) + 1;
static_cast<int>(GrpcMethod::kRequestSendAndRecv) + 1;
inline const char* GrpcMethodName(GrpcMethod id) {
switch (id) {
......@@ -108,6 +110,8 @@ inline const char* GrpcMethodName(GrpcMethod id) {
return "/sendrecv.SendRecvService/CheckpointNotify";
case GrpcMethod::kRequestNotify:
return "/sendrecv.SendRecvService/DistributeNotify";
case GrpcMethod::kRequestSendAndRecv:
return "/sendrecv.SendRecvService/SendAndRecvVariable";
}
// Shouldn't be reached.
......
......@@ -46,6 +46,7 @@ constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier";
constexpr char kRequestNotify[] = "RequestNotify";
constexpr char kRequestSendAndRecv[] = "RequestSendAndRecv";
constexpr char kSendRPC[] = "SendRPC";
constexpr char kGetRPC[] = "GetRPC";
......@@ -57,6 +58,7 @@ constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC";
constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC";
constexpr char kSendCompleteRPC[] = "SendCompleteRPC";
constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC";
constexpr char kSendAndRecvRPC[] = "SendAndRecvRPC";
constexpr int64_t kPrefetchTimeout = 60000;
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
......
......@@ -325,6 +325,22 @@ bool RequestNotifyHandler::Handle(const std::string &varname,
return true;
}
bool RequestSendAndRecvHandler::Handle(const std::string &varname,
framework::Scope *Scope,
framework::Variable *var,
framework::Variable **outvar,
const int trainer_id,
const std::string &out_var_name,
const std::string &table_name) {
VLOG(3) << "SendAndRecvHandle: " << varname
<< " out_var_name: " << out_var_name
<< " , trainer_id: " << trainer_id;
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), Scope);
*outvar = Scope->FindVar(out_var_name);
return true;
}
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -176,6 +176,17 @@ class RequestNotifyHandler final : public RequestHandler {
std::unordered_map<int, int64_t> decay_counters;
};
class RequestSendAndRecvHandler final : public RequestHandler {
public:
explicit RequestSendAndRecvHandler(int distributed_mode)
: RequestHandler(distributed_mode) {}
virtual ~RequestSendAndRecvHandler() {}
bool Handle(const std::string& varname, framework::Scope* Scope,
framework::Variable* var, framework::Variable** outvar,
const int trainer_id, const std::string& out_var_name = "",
const std::string& table_name = "") override;
};
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -85,6 +85,12 @@ class RPCClient {
const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncSendAndRecv(
const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& send_var_name,
const std::string& recv_var_name, const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncSendComplete(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
......
......@@ -35,27 +35,24 @@ namespace platform = paddle::platform;
namespace distributed = paddle::operators::distributed;
USE_NO_KERNEL_OP(lookup_sparse_table_read);
USE_OP(scale);
std::unique_ptr<distributed::RPCServer> g_rpc_service;
std::unique_ptr<distributed::RequestHandler> g_req_handler;
framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
auto root_block = program->MutableBlock(0);
auto* block = program->AppendBlock(*root_block);
framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}});
framework::VariableNameMap output({{"Output", {"out"}}});
auto op = block->AppendOp();
op->SetType("lookup_sparse_table_read");
op->SetInput("W", {"w"});
op->SetInput("Ids", {"ids"});
op->SetOutput("Out", {"out"});
op->SetAttr("tablename", {"w"});
op->SetAttr("value_names", {"Param"});
auto& out = *root_block->Var("out");
framework::OpDesc* op = block->AppendOp();
op->SetType("scale");
op->SetInput("X", {"x"});
op->SetOutput("Out", {"res"});
op->SetAttr("scale", 0.5f);
auto& out = *root_block->Var("res");
out.SetType(framework::proto::VarType::LOD_TENSOR);
out.SetShape({10, 10});
out.SetShape({1, 10});
return block;
}
......@@ -69,6 +66,12 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
auto ids_var = scope->Var("ids");
ids_var->GetMutable<framework::LoDTensor>();
auto x_var = scope->Var("x");
x_var->GetMutable<framework::LoDTensor>();
auto res_var = scope->Var("res");
res_var->GetMutable<framework::LoDTensor>();
}
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
......@@ -78,6 +81,11 @@ void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
int64_t* ids_ptr =
ids_var->mutable_data<int64_t>(framework::DDim({rows_numel, 1}), *place);
for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2;
auto x_var = scope->Var("x")->GetMutable<framework::LoDTensor>();
float* x_ptr =
x_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0;
}
void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
......@@ -124,6 +132,38 @@ void StartServer(const std::string& rpc_name) {
server_thread.join();
}
void StartSendAndRecvServer(const std::string& rpc_name) {
framework::ProgramDesc program;
framework::Scope scope;
platform::CPUPlace place;
framework::Executor exe(place);
platform::CPUDeviceContext ctx(place);
auto block = AppendSendAndRecvBlock(&program);
std::string in_var_name("x");
std::vector<int> prefetch_block_ids{block->ID()};
auto prepared = exe.Prepare(program, prefetch_block_ids);
InitTensorsOnServer(&scope, &place, 10);
std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>>
grad_to_prepared_ctx;
grad_to_prepared_ctx[in_var_name] = prepared[0];
g_req_handler->SetProgram(&program);
g_req_handler->SetGradToPreparedCtx(&grad_to_prepared_ctx);
g_req_handler->SetDevCtx(&ctx);
g_req_handler->SetScope(&scope);
g_req_handler->SetExecutor(&exe);
g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get());
g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread(
std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
server_thread.join();
}
TEST(COMPLETE, CPU) {
setenv("http_proxy", "", 1);
setenv("https_proxy", "", 1);
......@@ -147,3 +187,46 @@ TEST(COMPLETE, CPU) {
g_rpc_service.reset(nullptr);
g_req_handler.reset(nullptr);
}
TEST(SENDANDRECV, CPU) {
setenv("http_proxy", "", 1);
setenv("https_proxy", "", 1);
g_req_handler.reset(new distributed::RequestSendAndRecvHandler(
distributed::DistributedMode::kAsync));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
PADDLE_ENFORCE_NE(client, nullptr,
platform::errors::InvalidArgument(
"Client Start Fail, Check Your Code & Env"));
std::thread server_thread(StartSendAndRecvServer,
distributed::kRequestSendAndRecv);
g_rpc_service->WaitServerReady();
int port = g_rpc_service->GetSelectedPort();
std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
framework::Scope scope;
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
// create var on local scope
int64_t rows_numel = 10;
InitTensorsOnClient(&scope, &place, rows_numel);
std::string in_var_name("x");
std::string out_var_name("res");
client->AsyncSendAndRecv(ep, ctx, scope, in_var_name, out_var_name);
client->Wait();
auto var = scope.Var(out_var_name);
auto value = var->GetMutable<framework::LoDTensor>();
auto ptr = value->mutable_data<float>(place);
for (int64_t i = 0; i < rows_numel; ++i) {
EXPECT_EQ(ptr[i], 0.5);
}
g_rpc_service->ShutDown();
server_thread.join();
LOG(INFO) << "begin reset";
g_rpc_service.reset(nullptr);
g_req_handler.reset(nullptr);
}
......@@ -29,7 +29,7 @@ service SendRecvService {
rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
rpc DistributeNotify(VariableMessage) returns (VoidMessage) {}
rpc SendAndRecvVariable(VariableMessage) returns (VariableMessage) {}
rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {}
rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
}
......
......@@ -96,6 +96,13 @@ class VariableResponse {
return scope_->FindVar(meta_.varname());
}
framework::Variable* GetRecvVar() {
if (create_scope_) {
return local_scope_->Var(meta_.out_varname());
}
return scope_->FindVar(meta_.out_varname());
}
int GetTrainerId() { return static_cast<int>(meta_.trainer_id()); }
protected:
......
......@@ -268,7 +268,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
size_t num_blocks = program->Size();
PADDLE_ENFORCE_GE(num_blocks, 2,
"server program should have at least 2 blocks");
std::vector<int> block_list;
for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
block_list.push_back(blkid);
......@@ -295,6 +294,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_send_and_recv_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
while (true) {
if (rpc_service_->IsExit()) {
......@@ -394,6 +394,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
new distributed::RequestGetNoBarrierHandler());
request_notify_handler_.reset(
new distributed::RequestNotifyHandler(distributed_mode, fan_in));
request_send_and_recv_handler_.reset(
new distributed::RequestSendAndRecvHandler(distributed_mode));
rpc_service_->RegisterRPC(distributed::kRequestSend,
request_send_handler_.get(), rpc_send_thread_num);
......@@ -408,6 +410,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
request_get_no_barrier_handler_.get());
rpc_service_->RegisterRPC(distributed::kRequestNotify,
request_notify_handler_.get(), rpc_send_thread_num);
rpc_service_->RegisterRPC(distributed::kRequestSendAndRecv,
request_send_and_recv_handler_.get(),
rpc_get_thread_num);
auto optimize_blocks =
Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
......@@ -416,6 +421,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
"optimize blocks is less than 1. Optimize blocks "
"should be 1 at least on the pserver side."));
auto *program = optimize_blocks[0]->Program();
framework::Executor executor(dev_place);
std::shared_ptr<framework::ExecutorPrepareContext> ckpt_pre_context = nullptr;
......@@ -488,6 +494,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
f(request_checkpoint_handler_.get());
f(request_get_no_barrier_handler_.get());
f(request_notify_handler_.get());
f(request_send_and_recv_handler_.get());
// register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
signal(SIGINT, SignalHandler::StopAndExit);
......
......@@ -99,6 +99,8 @@ class ListenAndServOp : public framework::OperatorBase {
mutable std::shared_ptr<distributed::RequestHandler>
request_checkpoint_handler_;
mutable std::shared_ptr<distributed::RequestHandler> request_notify_handler_;
mutable std::shared_ptr<distributed::RequestHandler>
request_send_and_recv_handler_;
mutable std::shared_ptr<std::thread> server_thread_;
mutable std::vector<std::string> sparse_vars_;
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/distributed/communicator.h"
#include "paddle/fluid/operators/distributed/communicator_common.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/parameter_send.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SendAndRecvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& scope = ctx.scope();
const auto& place = ctx.GetPlace();
auto send_var_name = ctx.Attr<std::string>("send_var_name");
auto recv_var_name = ctx.Attr<std::string>("recv_var_name");
auto epmap = ctx.Attr<std::string>("endpoint");
auto trainer_id = ctx.Attr<int>("trainer_id");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& context = *pool.Get(place);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
VLOG(3) << "SendAndRecvOp Send_var_name: " << send_var_name
<< " Recv_var_name: " << recv_var_name;
distributed::VarHandlePtr rets = rpc_client->AsyncSendAndRecv(
epmap, context, scope, send_var_name, recv_var_name);
rets->Wait();
}
};
class SendAndRecvOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(data_type, platform::CPUPlace());
}
};
class SendAndRecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "Tensor Input variable to be sent").AsDuplicable();
AddOutput("Out", "Tensor Output varibale to be recv").AsDuplicable();
AddAttr<std::string>("send_var_name", "Send Tensor's name")
.SetDefault(std::string(""));
AddAttr<std::string>("recv_var_name", "Recv Tensor's name")
.SetDefault(std::string(""));
AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
AddAttr<std::string>("endpoint", "Server endpoint")
.SetDefault({"127.0.0.1:6164"});
AddComment(R"DOC(
SendAndRecv operator
    This operator will send variables to the listen_and_serve op at the parameter server,
    and receive a variable back from the parameter server into the sending variable's scope.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_and_recv, ops::SendAndRecvOp, ops::SendAndRecvOpMaker);
REGISTER_OP_CPU_KERNEL(
send_and_recv,
ops::SendAndRecvKernel<paddle::platform::CPUDeviceContext, float>)
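As an illustration of how the new send_and_recv operator could be configured programmatically, a hedged sketch using the OpDesc calls seen elsewhere in this commit; the variable names, endpoint, and helper name are placeholders:

#include <string>
#include "paddle/fluid/framework/op_desc.h"

// Build an OpDesc for the send_and_recv op registered above. The attributes
// mirror those defined in SendAndRecvOpMaker.
void BuildSendAndRecvOpDesc(paddle::framework::OpDesc* desc) {
  desc->SetType("send_and_recv");
  desc->SetInput("X", {"x"});       // variable sent to the parameter server
  desc->SetOutput("Out", {"res"});  // variable received back into local scope
  desc->SetAttr("send_var_name", std::string("x"));
  desc->SetAttr("recv_var_name", std::string("res"));
  desc->SetAttr("endpoint", std::string("127.0.0.1:6164"));
  desc->SetAttr("trainer_id", 0);
}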
......@@ -24,49 +24,69 @@ class AdadeltaOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(Grad) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("AvgSquaredGrad"),
"Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
"Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(
PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
platform::errors::InvalidArgument(
"Input(Param) of AdadeltaOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
platform::errors::InvalidArgument(
"Input(Grad) of AdadeltaOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("AvgSquaredGrad"), true,
platform::errors::InvalidArgument(
"Input(AvgSquaredGrad) of AdadeltaOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("AvgSquaredUpdate"), true,
platform::errors::InvalidArgument(
"Input(AvgSquaredUpdate) of AdadeltaOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Param").front() ==
framework::proto::VarType::LOD_TENSOR,
true,
platform::errors::InvalidArgument(
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(
ctx->Inputs("Param").front(),
ctx->GetInputsVarType("Param").front()));
PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Grad").front() ==
framework::proto::VarType::LOD_TENSOR,
true,
platform::errors::InvalidArgument(
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
ctx->Inputs("Grad").front(),
ctx->GetInputsVarType("Grad").front()));
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("AvgSquaredGradOut"),
"Output(AvgSquaredGradOut) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("AvgSquaredUpdateOut"),
"Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.");
PADDLE_ENFORCE_EQ(
ctx->HasOutput("ParamOut"), true,
platform::errors::InvalidArgument(
"Output(ParamOut) of AdadeltaOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("AvgSquaredGradOut"), true,
platform::errors::InvalidArgument(
"Output(AvgSquaredGradOut) of AdadeltaOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("AvgSquaredUpdateOut"), true,
platform::errors::InvalidArgument(
"Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null."));
auto param_dim = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"),
"param and grad input of AdadeltaOp should have same dimension");
PADDLE_ENFORCE_NE(framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0,
PADDLE_ENFORCE_NE(
framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0,
platform::errors::InvalidArgument(
"Maybe the Input variable AvgSquaredGrad has not "
"been initialized. You may need to confirm if you put "
"exe.run(startup_program) after optimizer.minimize "
"function.");
"function."));
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"),
platform::errors::InvalidArgument(
"Param and AvgSquaredGrad input of AdadeltaOp "
"should have same dimension");
"should have same dimension"));
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"),
platform::errors::InvalidArgument(
"Param and AvgSquaredUpdate input of AdadeltaOp "
"should have same dimension");
"should have same dimension"));
ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
......
......@@ -24,17 +24,19 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type()));
framework::ToTypeName(param_var->Type())));
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type()));
framework::ToTypeName(grad_var->Type())));
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto avg_squared_grad_out_tensor =
......
......@@ -23,22 +23,27 @@ class TopkOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of TopkOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of TopkOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Indices"),
"Output(Indices) of TopkOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of TopkOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Out) of TopkOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Indices"), true,
platform::errors::InvalidArgument(
"Output(Indices) of TopkOp should not be null."));
auto input_dims = ctx->GetInputDim("X");
const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
PADDLE_ENFORCE_GE(k, 1, "k must >= 1");
PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape");
PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument(
"input must have >= 1d shape"));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k,
"input must have >= k columns");
PADDLE_ENFORCE_GE(
input_dims[input_dims.size() - 1], k,
platform::errors::InvalidArgument("input must have >= k columns"));
}
framework::DDim dims = input_dims;
......
......@@ -43,8 +43,9 @@ template <typename DeviceContext, typename T>
class TopkOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("It must use CUDAPlace."));
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
auto* indices = ctx.Output<Tensor>("Indices");
......
......@@ -206,9 +206,9 @@ void BindInferenceApi(py::module *m) {
BindMkldnnQuantizerConfig(m);
#endif
m->def("create_paddle_predictor",
&paddle::CreatePaddlePredictor<AnalysisConfig>);
&paddle::CreatePaddlePredictor<AnalysisConfig>, py::arg("config"));
m->def("create_paddle_predictor",
&paddle::CreatePaddlePredictor<NativeConfig>);
&paddle::CreatePaddlePredictor<NativeConfig>, py::arg("config"));
m->def("paddle_dtype_size", &paddle::PaddleDtypeSize);
m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes);
}
......
......@@ -1399,6 +1399,9 @@ function main() {
local CMD=$1
local parallel_number=$2
init
if [ "$CMD" != "assert_file_approvals" ];then
python ${PADDLE_ROOT}/tools/summary_env.py
fi
case $CMD in
build_only)
cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
......
......@@ -30,8 +30,11 @@ __all__ = ["spawn"]
# dygraph parallel apis
__all__ += [
"init_parallel_env", "get_rank", "get_world_size", "prepare_context",
"ParallelEnv"
"init_parallel_env",
"get_rank",
"get_world_size",
"prepare_context",
"ParallelEnv",
]
# collective apis
......
......@@ -18,16 +18,15 @@ from .base.distributed_strategy import DistributedStrategy
from .base.fleet_base import Fleet
from .base.util_factory import UtilBase
from .dataset import *
#from . import metrics
__all__ = [
"DistributedStrategy",
"UtilBase",
"DatasetFactory",
"DatasetBase",
"InMemoryDataset",
"QueueDataset",
"UserDefinedRoleMaker",
"PaddleCloudRoleMaker",
"Fleet",
]
fleet = Fleet()
......
......@@ -17,6 +17,8 @@ from paddle.distributed.fleet.proto import distributed_strategy_pb2
from paddle.fluid.framework import Variable, set_flags, core
import google.protobuf.text_format
__all__ = ["DistributedStrategy"]
def get_msg_dict(msg):
res_dict = {}
......
......@@ -22,7 +22,7 @@ from .runtime_factory import RuntimeFactory
from .util_factory import UtilFactory
from paddle.fluid.wrapped_decorator import wrap_decorator
__all__ = ['Fleet']
#__all__ = ['Fleet']
def _inited_runtime_handler_(func):
......@@ -200,7 +200,8 @@ class Fleet(object):
bool: True if this is a node of server,
False if not.
"""
return self._role_maker.is_server()
return self._role_maker.is_server(
) or self._role_maker._is_heter_worker()
@property
def util(self):
......
......@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ["MetaOptimizerFactory"]
from ..meta_optimizers import *
meta_optimizer_names = list(
......
......@@ -14,15 +14,17 @@
"""Defination of Role Makers."""
import os
import numpy as np
import warnings
from multiprocessing import Process, Manager
import paddle.fluid as fluid
__all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
#__all__ = ['UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
class Role:
WORKER = 1
SERVER = 2
HETER_WORKER = 3
class RoleMakerBase(object):
......@@ -40,6 +42,11 @@ class RoleMakerBase(object):
self._role = None
self._current_id = -1
# for heter parameter server mode
self._heter_trainer_endpoints = []
self._heter_trainer_device = "CPU"
self._is_heter_parameter_server_mode = False
self._node_type = None
self._node_type_comm = None
self._all_comm = None
......@@ -163,12 +170,58 @@ class RoleMakerBase(object):
"""
print("warning: RoleMakerBase does not have barrier worker.")
def _is_heter_worker(self):
"""
Return is_heter_worker() of current process
"""
warnings.warn("RoleMakerBase does not have function: _is_heter_worker.")
return False
def _heter_worker_num(self):
"""
Get current total heter-worker number.
Returns:
int: heter_worker number
"""
warnings.warn(
"RoleMakerBase does not have function: _heter_worker_num.")
return 0
def _get_heter_worker_endpoints(self):
"""
Returns:
list: endpoints of all heter_trainers
"""
assert self._heter_trainer_endpoints != []
return self._heter_trainer_endpoints
def _get_heter_worker_endpoint(self):
"""
Returns:
string: the corresponding heter_trainer's endpoint, chosen as
heter_trainer_endpoints[(current_id + 1) % heter_worker_num]
e.g: if we have 4 cpu-trainers (default) and 2 gpu-trainers (heter),
then No.0 and No.2 cpu-trainers will work with No.1 gpu-trainer,
and No.1 and No.3 cpu-trainers will work with No.0 gpu-trainer
"""
assert self._heter_trainer_endpoints != []
return self._heter_trainer_endpoints[(self._current_id + 1) %
self._heter_worker_num()]
def _get_heter_worker_device(self):
"""
Returns:
string: heter_trainer's device of current node, e.g: CPU/GPU/XPU
"""
return self._heter_trainer_device.upper()
class PaddleCloudRoleMaker(RoleMakerBase):
def __init__(self, is_collective=False, **kwargs):
super(PaddleCloudRoleMaker, self).__init__()
self._is_collective = is_collective
self._init_gloo = False #default no init gloo
self._init_gloo = False # default no init gloo
self._kwargs = kwargs
self._role_is_generated = False
......@@ -278,10 +331,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
"""
get index of current node
"""
if self.is_server():
return self.server_index()
elif self.is_worker():
return self.worker_index()
return self._current_id
def worker_num(self):
"""
......@@ -323,6 +373,22 @@ class PaddleCloudRoleMaker(RoleMakerBase):
self.generate_role()
return self._server_endpoints
def _heter_worker_num(self):
"""
get the number of heter workers
"""
if not self._role_is_generated:
self.generate_role()
return self._heter_trainers_num
def _is_heter_worker(self):
"""
whether current process is heter worker
"""
if not self._role_is_generated:
self.generate_role()
return self._role == Role.HETER_WORKER
def _get_rank(self):
"""
get current rank in all workers and pservers
......@@ -342,17 +408,47 @@ class PaddleCloudRoleMaker(RoleMakerBase):
def _ps_env(self):
try:
# Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
# format: string(ip:port), eg. 127.0.0.1:6001
self._server_endpoints = os.environ[
"PADDLE_PSERVERS_IP_PORT_LIST"].split(",")
# format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002
self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST",
"").split(",")
assert self._server_endpoints != [""]
self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
"").split(",")
assert self._worker_endpoints != [""]
trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
training_role = os.environ["TRAINING_ROLE"]
if training_role not in ["TRAINER", "PSERVER"]:
raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]:
raise ValueError(
"TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment.".
format(training_role))
# For heter parameter server env setting
heter_trainer_eplist = os.getenv(
"PADDLE_HETER_TRAINER_IP_PORT_LIST", None)
heter_trainer_device = os.getenv("PADDLE_HETER_TRAINER_DEVICE",
None)
if heter_trainer_eplist and heter_trainer_device:
try:
heter_trainer_eplist = os.environ[
"PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",")
except:
raise ValueError(
"Can not Find PADDLE_HETER_TRAINER_IP_PORT_LIST in env or its format doesn't match the requirement: 'IP:PORT,IP:PORT' ."
)
self._is_heter_parameter_server_mode = True
heter_trainers_num = len(heter_trainer_eplist)
current_node_device = heter_trainer_device.upper()
if current_node_device not in ["CPU", "GPU", "XPU"]:
raise ValueError(
"Heter Trainer doesn't support {} device now, please use CPU / GPU / XPU(KunLun)".
format(heter_trainer_device))
self._heter_trainer_device = current_node_device
else:
self._is_heter_parameter_server_mode = False
heter_trainers_num = 0
if training_role == "TRAINER":
role = Role.WORKER
......@@ -365,17 +461,26 @@ class PaddleCloudRoleMaker(RoleMakerBase):
ip = os.environ["POD_IP"]
self._cur_endpoint = ip + ":" + port
current_id = self._server_endpoints.index(self._cur_endpoint)
elif training_role == "HETER_TRAINER":
role = Role.HETER_WORKER
cur_ip = os.environ["POD_IP"]
cur_port = os.environ["PADDLE_PORT"]
curr_endpoint = ":".join([cur_ip, cur_port])
current_id = heter_trainer_eplist.index(curr_endpoint)
else:
raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
except ValueError as ve:
raise ValueError(
"something wrong with PaddleCloud, please check environment")
"TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER")
except ValueError as e:
raise ValueError(
"Something wrong with PaddleCloud, please check environment")
self._trainers_num = trainers_num
self._role = role
self._current_id = current_id
self._node_num = len(
set([x.split(':')[0] for x in self._worker_endpoints]))
self._heter_trainers_num = heter_trainers_num
self._heter_trainer_endpoints = heter_trainer_eplist
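A minimal sketch of the environment a heter parameter-server job would provide for the parsing above; the variable names come from the code, while the addresses, ports and counts are placeholders:

# Illustrative environment for a HETER_TRAINER process; all values are placeholders.
import os

os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:6001,127.0.0.1:6002"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6003,127.0.0.1:6004"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["TRAINING_ROLE"] = "HETER_TRAINER"              # or "TRAINER" / "PSERVER"
os.environ["PADDLE_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:6005"
os.environ["PADDLE_HETER_TRAINER_DEVICE"] = "gpu"          # cpu / gpu / xpu
os.environ["POD_IP"] = "127.0.0.1"                         # used by PSERVER / HETER_TRAINER
os.environ["PADDLE_PORT"] = "6005"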
def _collective_env(self):
self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
......
......@@ -15,24 +15,10 @@ from .amp_optimizer import AMPOptimizer
from .recompute_optimizer import RecomputeOptimizer
from .gradient_merge_optimizer import GradientMergeOptimizer
from .graph_execution_optimizer import GraphExecutionOptimizer
from .async_optimizer import AsyncMetaOptimizer
from .parameter_server_optimizer import ParameterServerOptimizer
from .pipeline_optimizer import PipelineOptimizer
from .localsgd_optimizer import LocalSGDOptimizer
from .lars_optimizer import LarsOptimizer
from .async_graph_execution_optimizer import AsyncGraphExecutionOptimizer
from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer
from .dgc_optimizer import DGCOptimizer
from .lamb_optimizer import LambOptimizer
__all__ = [
'AMPOptimizer',
'RecomputeOptimizer',
'GradientMergeOptimizer',
'AsyncMetaOptimizer',
'GraphExecutionOptimizer',
'PipelineOptimizer',
'LocalSGDOptimizer',
'LarsOptimizer',
'AsyncGraphExecutionOptimizer',
'DGCOptimizer',
'LambOptimizer',
]
......@@ -14,8 +14,6 @@
import paddle.fluid.contrib.mixed_precision as mixed_precision
from .meta_optimizer_base import MetaOptimizerBase
__all__ = ["AMPOptimizer"]
class AMPOptimizer(MetaOptimizerBase):
def __init__(self, optimizer):
......
......@@ -15,8 +15,6 @@ from paddle.fluid.optimizer import Momentum, DGCMomentumOptimizer
from .meta_optimizer_base import MetaOptimizerBase
import logging
__all__ = ["DGCOptimizer"]
class DGCOptimizer(MetaOptimizerBase):
def __init__(self, optimizer):
......
......@@ -14,10 +14,6 @@
from paddle.fluid.optimizer import GradientMergeOptimizer as GM
from .meta_optimizer_base import MetaOptimizerBase
__all__ = ["GradientMergeOptimizer"]
# amp + gradient merge + lamb
class GradientMergeOptimizer(MetaOptimizerBase):
def __init__(self, optimizer):
......
......@@ -16,8 +16,6 @@ from paddle.fluid.optimizer import LambOptimizer as LAMB
from .meta_optimizer_base import MetaOptimizerBase
import logging
__all__ = ["LambOptimizer"]
class LambOptimizer(MetaOptimizerBase):
def __init__(self, optimizer):
......
......@@ -15,8 +15,6 @@ from paddle.fluid.optimizer import Momentum, LarsMomentumOptimizer
from .meta_optimizer_base import MetaOptimizerBase
import logging
__all__ = ["LarsOptimizer"]
class LarsOptimizer(MetaOptimizerBase):
def __init__(self, optimizer):
......
......@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ["MetaOptimizerBase"]
from paddle.fluid.optimizer import Optimizer
......
......@@ -13,12 +13,12 @@
from paddle import fluid
from paddle.fluid import compiler
from .async_optimizer import AsyncMetaOptimizer
from .parameter_server_optimizer import ParameterServerOptimizer
class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer):
class ParameterServerGraphOptimizer(ParameterServerOptimizer):
def __init__(self, optimizer):
super(AsyncGraphExecutionOptimizer, self).__init__(optimizer)
super(ParameterServerGraphOptimizer, self).__init__(optimizer)
self.inner_opt = optimizer
# we do not allow meta optimizer to be inner optimizer currently
self.meta_optimizers_white_list = []
......@@ -31,6 +31,9 @@ class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer):
if self.role_maker.is_server():
return False
if self.role_maker._is_heter_parameter_server_mode:
return False
return True
def _disable_strategy(self, dist_strategy):
......
......@@ -15,9 +15,9 @@ from paddle import fluid
from .meta_optimizer_base import MetaOptimizerBase
class AsyncMetaOptimizer(MetaOptimizerBase):
class ParameterServerOptimizer(MetaOptimizerBase):
def __init__(self, optimizer):
super(AsyncMetaOptimizer, self).__init__(optimizer)
super(ParameterServerOptimizer, self).__init__(optimizer)
self.inner_opt = optimizer
# we do not allow meta optimizer to be inner optimizer currently
self.meta_optimizers_white_list = []
......@@ -68,6 +68,21 @@ class AsyncMetaOptimizer(MetaOptimizerBase):
_startup = worker.init_from_server_pass(_startup, compiled_config)
_startup = worker.delet_extra_optimizes_pass(_startup,
compiled_config)
# for heter program
if self.role_maker._is_heter_parameter_server_mode:
from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker
if self.role_maker._is_heter_worker():
# for heter worker
_main = heter_worker.split_heter_worker_ops_pass(
_main, compiled_config)
else:
# for default worker
_main = heter_worker.split_trainer_ops_pass(_main,
compiled_config)
# for startup change
_startup = heter_worker.delete_startup_useless_ops_var_pass(
_startup, _main, compiled_config)
else:
_main = worker.append_send_ops_pass(_main, compiled_config)
_startup = _startup
......@@ -129,9 +144,12 @@ class AsyncMetaOptimizer(MetaOptimizerBase):
_origin_startup_program,
strategy, self.role_maker)
main_program, startup_program = \
self._build_trainer_programs(compiled_config) if self.role_maker.is_worker() \
else self._build_pserver_programs(compiled_config)
if self.role_maker.is_worker() or self.role_maker._is_heter_worker():
main_program, startup_program = self._build_trainer_programs(
compiled_config)
elif self.role_maker.is_server():
main_program, startup_program = self._build_pserver_programs(
compiled_config)
loss.block.program = main_program
fluid.framework.switch_startup_program(startup_program)
......
......@@ -20,8 +20,6 @@ from paddle.fluid.optimizer import PipelineOptimizer as PO
from .meta_optimizer_base import MetaOptimizerBase
from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op
__all__ = ["PipelineOptimizer"]
class PipelineHelper(CollectiveHelper):
def __init__(self, role_maker, nrings=1, wait_port='6174'):
......
......@@ -14,8 +14,6 @@
from paddle.fluid.optimizer import RecomputeOptimizer as RO
from .meta_optimizer_base import MetaOptimizerBase
__all__ = ["RecomputeOptimizer"]
class RecomputeOptimizer(MetaOptimizerBase):
def __init__(self, optimizer):
......
......@@ -11,3 +11,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .metric import *
__all__ = [
"sum",
"max",
"min",
"auc",
"mae",
"rmse",
"mse",
"acc",
]
......@@ -14,5 +14,3 @@
from .collective_runtime import CollectiveRuntime
from .parameter_server_runtime import ParameterServerRuntime
__all__ = ["CollectiveRuntime," "ParameterServerRuntime", ]
......@@ -196,6 +196,18 @@ class ParameterServerRuntime(RuntimeBase):
else:
warnings.warn("communicator has been initialized, skip")
def _get_executor(self):
if self.role_maker._is_heter_worker():
if self.role_maker._get_heter_worker_device() == "GPU":
gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
executor = Executor(fluid.CUDAPlace(gpu_id))
else:
raise ValueError("Not Support Device {}".format(
self.role_maker._get_heter_worker_device()))
else:
executor = fluid.Executor(fluid.CPUPlace())
return executor
def _init_server(self, *args, **kwargs):
if len(args) > 1:
raise ValueError("init server can only accept 1 args: `dirname`")
......@@ -204,9 +216,15 @@ class ParameterServerRuntime(RuntimeBase):
else:
model_dirname = None
executor = fluid.Executor(fluid.CPUPlace())
if self.role_maker._is_heter_worker():
self._init_worker()
executor = self._get_executor()
executor.run(fluid.default_startup_program())
if self.role_maker._is_heter_worker():
return
if not model_dirname:
return
......@@ -237,12 +255,12 @@ class ParameterServerRuntime(RuntimeBase):
# self._load_sparse_params(dirname=model_dir, varnames=distribtued_varnames)
def _run_server(self):
executor = fluid.Executor(fluid.CPUPlace())
executor = self._get_executor()
executor.run(fluid.default_main_program())
def _stop_worker(self):
self._communicator.stop()
executor = fluid.Executor(fluid.CPUPlace())
executor = self._get_executor()
executor.close()
def _get_optimizer_status(self, op, param_name):
......
......@@ -15,4 +15,4 @@
from .fs import *
from .http_server import KVHandler, KVHTTPServer, KVServer
__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__
#__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__
......@@ -145,7 +145,7 @@ class Fleet(object):
Returns:
bool: True if this is a node of server,
False if not.
False if not
"""
return self._role_maker.is_server()
......
......@@ -343,7 +343,6 @@ class MPISymetricRoleMaker(MPIRoleMaker):
def get_pserver_endpoints(self):
"""
get pserver endpoints
Returns:
endpoints(list): pserver endpoints
"""
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import warnings
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.fluid.transpiler.details.program_utils import delete_ops
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import find_heter_ops
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import create_heter_program
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import create_trainer_program
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import find_block_joints
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import find_op_input_output
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import get_vars_name_in_block
def split_heter_worker_ops_pass(program, config):
"""
split the heter worker program from the origin program
1. find heter ops (ops placed on a different device)
2. find the inputs & outputs of every heter-block
3. create the heter worker program and add the listen_and_serv op
"""
default_deveice = "cpu"
program, heter_ops, _, program_block_ops = find_heter_ops(program,
default_deveice)
if len(heter_ops) == 0:
warnings.warn(
"Currently running in Heter Parameter Server mode, but no OP running on heterogeneous devices, Please check your code."
)
return program
current_device = "gpu"
if current_device not in heter_ops:
raise ValueError("Op which run on device {} not exist.".format(
current_device))
block_vars_detail = find_block_joints(program, program_block_ops, heter_ops)
heter_program = framework.Program()
create_heter_program(program, config, heter_program, heter_ops,
block_vars_detail, current_device)
return heter_program
def split_trainer_ops_pass(program, config):
"""
split the cpu-trainer program from the origin program
1. find heter ops (ops placed on a different device)
2. find the inputs & outputs of every heter-block
3. create the cpu-trainer program and add send & recv ops
"""
# TODO(MrChengmo): support user-defined default_device
default_deveice = "cpu"
program, heter_ops, _, program_block_ops = find_heter_ops(program,
default_deveice)
block_vars_detail = find_block_joints(program, program_block_ops, heter_ops)
create_trainer_program(program, config, heter_ops, block_vars_detail)
return program
def delete_startup_useless_ops_var_pass(startup_program, main_program, config):
"""
delete variables which are not used in the current main_program
"""
# find all op and its var
vars_in_main_program = get_vars_name_in_block(main_program.global_block())
block_nums = startup_program.num_blocks
for block_index in range(1, block_nums):
current_block = startup_program.block(block_index)
# delete useless op
need_delete_op = []
for op in current_block.ops:
inputs, outputs = find_op_input_output(startup_program,
current_block, op)
inputs += outputs
# Todo: delete some concat op
if not list(set(inputs) & set(vars_in_main_program)):
need_delete_op.append(op)
delete_ops(current_block, need_delete_op)
# delete useless var
for var in current_block.vars:
if var.name not in vars_in_main_program:
startup_program._remove_var(var.name)
return startup_program
......@@ -12,33 +12,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright(c) 2020 PaddlePaddle Authors.All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0(the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http: // www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from functools import reduce
import collections
import math
import os
import warnings
import six
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.core import CommContext
import paddle.fluid.framework as framework
from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode
from paddle.fluid.incubate.fleet.parameter_server.ir import vars_metatools
from paddle.fluid.incubate.fleet.parameter_server.ir.ps_dispatcher import RoundRobin, PSDispatcher
from paddle.fluid.transpiler.details.program_utils import delete_ops
OP_NAME_SCOPE = "op_namescope"
CLIP_OP_NAME_SCOPE = "@CLIP"
......@@ -122,9 +112,20 @@ class MergedVariable:
self.offsets = offsets
def Singleton(cls):
_instance = {}
def _singleton(*args, **kargs):
if cls not in _instance:
_instance[cls] = cls(*args, **kargs)
return _instance[cls]
return _singleton
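A quick illustration of the Singleton decorator defined above, using a hypothetical class (not part of this change) to show that repeated construction returns the cached instance:

@Singleton
class _Demo(object):  # hypothetical class, for illustration only
    def __init__(self, name):
        self.name = name

a = _Demo("first")
b = _Demo("second")
assert a is b              # the second call returns the cached instance
assert a.name == "first"   # __init__ only ran for the first construction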
@Singleton
class CompileTimeStrategy(object):
def __init__(self, main_program, startup_program, strategy, role_maker):
self.min_block_size = 8192
self.origin_main_program = main_program
......@@ -177,6 +178,12 @@ class CompileTimeStrategy(object):
def get_ps_endpoints(self):
return self.role_maker.get_pserver_endpoints()
def get_heter_worker_endpoints(self):
return self.role_maker._get_heter_worker_endpoints()
def get_heter_worker_endpoint(self):
return self.role_maker._get_heter_worker_endpoint()
def get_origin_programs(self):
return self.origin_main_program, self.origin_startup_program
......@@ -810,6 +817,30 @@ class CompileTimeStrategy(object):
return sparse_param_grads, dense_param_grads
def remove_var_pair_by_grad(self, var_name):
for index, pair in enumerate(self.merged_variables_pairs):
var = pair[0]
var_grad = pair[1]
if var_grad.merged_var.name == var_name:
del self.merged_variables_pairs[index]
for index, pair in enumerate(self.merged_dense_pairs):
var = pair[0]
var_grad = pair[1]
if var_grad.merged_var.name == var_name:
del self.merged_dense_pairs[index]
return
for index, pair in enumerate(self.merged_sparse_pairs):
var = pair[0]
var_grad = pair[1]
if var_grad.merged_var.name == var_name:
del self.merged_sparse_pairs[index]
return
print("Not find {} in self.merge_pairs".format(var_name))
def _is_opt_role_op(op):
# NOTE : depend on oprole to find out whether this op is for
......
......@@ -13,7 +13,13 @@
# limitations under the License.
from __future__ import print_function
import six
import collections
import warnings
import math
from functools import reduce
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
......@@ -34,6 +40,10 @@ LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched
OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize
op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
DEVICE_LIST = ["cpu", "gpu", "xpu"]
COMMUNICATE_OPS_TYPE = ["send", "recv", "fetch_barrier", "send_barrier"]
DEFAULT_DEVICE = 'cpu'
def delete_optimizer_pass(program, config):
def _delete_optimizer_op_and_vars(_program, optimize_ops):
......@@ -250,7 +260,7 @@ def fake_init_ops_pass(program, config):
return list(set(dist_varnames + sparse_varnames))
def _fake_init_sparsetable(sparse_table_names):
#delete table init op
# delete table init op
for table_name in sparse_table_names:
table_var = program.global_block().vars[table_name]
table_param_init_op = []
......@@ -307,3 +317,871 @@ def delet_extra_optimizes_pass(program, config):
program.global_block()._remove_var(var)
return program
def find_heter_ops(program, default_device="cpu"):
if default_device not in DEVICE_LIST:
raise ValueError("Given device {} is not in device list {}".format(
default_device, DEVICE_LIST))
def _is_heter_op(op, current_heter_device, default_device="cpu"):
heter_devices = list(DEVICE_LIST)
heter_devices.remove(default_device)
op_device = op.attr("op_device")
op_type = op.type
if op_device in heter_devices:
return True
elif op_type in COMMUNICATE_OPS_TYPE and current_heter_device != default_device:
# for distributed communicate ops: send & recv & barrier, etc.
# TODO: this method needs to be updated
op._set_attr('op_device', current_heter_device)
return True
elif op_device is None or op_device == default_device:
op._set_attr('op_device', default_device)
return False
return False
def _is_same_device(op, pre_device, default_device="cpu"):
op_device = op.attr("op_device")
if op_device == pre_device:
return True
if pre_device == default_device:
return True
return False
def _append_heter_op(op, current_heter_block_ops, heter_ops):
op_device = op.attr("op_device")
if op_device not in heter_ops:
heter_ops[op_device] = {}
current_heter_block_ops.append(op)
origin_program = program.clone()
block = program.global_block()
program_block_ops = []
default_ops = {default_device: {}}
heter_ops = {}
block_index = 0
# heter_ops: {"gpu": {1:[op1, op2, ...], 2:[op1, op2, ...] }; "xpu": {3:[op1, op2, ...], 4:[op1, op2, ...] }}
current_heter_block_ops = []
current_default_block_ops = []
current_heter_device = default_device
is_heter = False
for op in block.ops:
if _is_heter_op(op, current_heter_device, default_device):
# for gpu/xpu-op
is_heter = True
# for cpu-op block append
if len(current_default_block_ops) > 1:
default_ops[default_device][
block_index] = current_default_block_ops
program_block_ops.append(current_default_block_ops)
current_default_block_ops = []
block_index += 1
if _is_same_device(op, current_heter_device, default_device):
# for gpu-op, gpu-op -> gpu-op,...
current_heter_device = op.attr("op_device")
_append_heter_op(op, current_heter_block_ops, heter_ops)
else:
# for gpu-op -> xpu-op, ...
op_device = current_heter_block_ops[0].attr("op_device")
heter_ops[op_device][block_index] = current_heter_block_ops
program_block_ops.append(current_heter_block_ops)
block_index += 1
current_heter_block_ops = []
current_heter_device = op.attr("op_device")
_append_heter_op(op, current_heter_block_ops, heter_ops)
elif is_heter:
# for gpu/xpu-op -> cpu-op
op_device = current_heter_block_ops[0].attr("op_device")
heter_ops[op_device][block_index] = current_heter_block_ops
program_block_ops.append(current_heter_block_ops)
block_index += 1
current_heter_block_ops = []
current_heter_device = default_device
is_heter = False
current_default_block_ops.append(op)
else:
# for cpu-op
current_default_block_ops.append(op)
if current_default_block_ops != []:
default_ops[default_device][block_index] = current_default_block_ops
program_block_ops.append(current_default_block_ops)
if current_heter_block_ops != []:
op_device = current_heter_block_ops[0].attr("op_device")
heter_ops[op_device][block_index] = current_heter_block_ops
program_block_ops.append(current_heter_block_ops)
if len(heter_ops) == 0:
warnings.warn(
"No heterogeneous OP was found in your program , "
" please using fluid.device_guard() to run OPs on different device.")
total_heter_ops = 0
heter_blocks = 0
for device in heter_ops.keys():
heter_block_dict = heter_ops[device]
heter_blocks += len(heter_block_dict)
for _, heter_block in heter_block_dict.items():
total_heter_ops += len(heter_block)
print(
"There are {} OPs in your main_program, and contains {} heter-OPs which is made up of {} heter-blocks.".
format(len(block.ops), total_heter_ops, heter_blocks))
return origin_program, heter_ops, default_ops, program_block_ops
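For intuition, a hand-traced result of the splitting above for a toy program whose six ops were placed via device_guard as [cpu, cpu, gpu, gpu, cpu, cpu] (op names are hypothetical):

# default_device = "cpu"; ops placed as: op0 cpu, op1 cpu, op2 gpu, op3 gpu, op4 cpu, op5 cpu
# heter_ops         -> {"gpu": {1: [op2, op3]}}
# default_ops       -> {"cpu": {0: [op0, op1], 2: [op4, op5]}}
# program_block_ops -> [[op0, op1], [op2, op3], [op4, op5]]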
def create_heter_program(program, config, heter_program, heter_ops,
block_var_detail, current_device):
# add heter op
optimizer_block = []
grad_to_block_id = []
send_grad_var_list = []
pre_block_idx = heter_program.num_blocks - 1
for index, heter_block_ops in heter_ops[current_device].items():
heter_block = heter_program._create_block(pre_block_idx)
optimizer_block.append(heter_block)
for _, op in enumerate(heter_block_ops):
block_append_op(heter_program, program, heter_block, op)
# add relate variables
inputs = _get_input_map_from_op(program.global_block().vars, op)
add_vars_by_op_map(inputs, heter_program)
outputs = _get_output_map_from_op(program.global_block().vars, op)
add_vars_by_op_map(outputs, heter_program)
entrance_vars = block_var_detail[index]["entrance"]
add_vars_by_var_list(entrance_vars, program, heter_program)
exit_vars = block_var_detail[index]["exit"]
add_vars_by_var_list(exit_vars, program, heter_program)
comm_info = get_communicate_var_info(program, index, entrance_vars,
exit_vars)
grad_to_block_id.append(comm_info["block_input_var_name"] + ":" + str(
heter_block.idx))
# create slice op
first_op_index = 0
get_type_var_name = comm_info["input_var_reshape_name"][0].split(
".input_reshape@Heter")[0]
get_type_var = heter_program.global_block().vars[get_type_var_name]
insert_recv_slice_op(
heter_program, heter_block, first_op_index,
comm_info["block_input_var_name"],
(-1, sum(comm_info["input_var_reshape_dim"])), get_type_var.dtype,
get_type_var.type, comm_info["input_var_reshape_name"], [
(-1, comm_info["input_var_reshape_dim"][i])
for i in range(len(comm_info["input_var_reshape_dim"]))
])
first_op_index += len(comm_info["input_var_reshape_dim"])
# create reshape op
for i in range(len(comm_info["input_var_reshape_name"])):
var_name = entrance_vars[i]
insert_reshape_op(
heter_program,
heter_block,
first_op_index,
comm_info["input_var_reshape_name"][i],
var_name, )
first_op_index += 1
first_op_index = len(heter_block.ops)
# create send reshape op
for i in range(len(exit_vars)):
insert_reshape_op(heter_program, heter_block, first_op_index,
exit_vars[i],
comm_info["output_var_reshape_name"][i],
[-1, comm_info["output_var_reshape_dim"][i]])
first_op_index += 1
# create send concat op
insert_send_concat_op(heter_program, heter_block, first_op_index,
comm_info["output_var_reshape_name"],
comm_info["block_output_var_name"],
[-1, sum(comm_info["output_var_reshape_dim"])])
check_op_device(heter_block, current_device)
send_grad_var_list = send_grad_var_list + add_heter_send_op(
program, heter_program, heter_block, block_var_detail[index])
# add step counter
send_input_vars = []
dummy_output = []
trainer_id = config.get_role_id()
pserver_endpoints = config.get_ps_endpoints()
optimizer_block[-1].append_op(
type="send",
inputs={"X": send_input_vars},
outputs={"Out": dummy_output},
attrs={
"send_varnames": [STEP_COUNTER],
"merge_add": True,
"use_send_handler": False,
"endpoints": pserver_endpoints
})
# add info in listen&serv
attrs = {
"grad_to_block_id": grad_to_block_id,
"sparse_grad_to_param": None,
"lr_decay_block_id": None,
"dense_optimize_blocks": None,
"sparse_optimize_blocks": None,
"optimize_blocks": optimizer_block,
# runtime attribute
"endpoint": config.get_heter_worker_endpoint(),
"pserver_id": config.get_role_id(),
"Fanin": config.get_trainers(),
"distributed_mode": config.get_distributed_mode(),
"rpc_get_thread_num": 12,
"rpc_send_thread_num": 12,
"rpc_prefetch_thread_num": 12
}
# append the listen_and_serv op
heter_program.global_block().append_op(
type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=attrs)
check_heter_compile_time_strategy(program, config, send_grad_var_list)
def check_heter_compile_time_strategy(program, config, send_grad_var_list):
origin_grad_var_list = []
for _, var_grad in config.merged_variables_pairs:
origin_grad_var_list.append(var_grad.merged_var.name)
origin_grad_var_list = list(set(origin_grad_var_list))
send_grad_var_list = list(set(send_grad_var_list))
useless_grad_var_list = list(
set(origin_grad_var_list) - set(send_grad_var_list))
for useless_grad_var in useless_grad_var_list:
config.remove_var_pair_by_grad(useless_grad_var)
def create_trainer_program(program, config, heter_ops, block_var_detail):
for device in heter_ops.keys():
for heter_block_index in sorted(heter_ops[device]):
replace_ops_by_communicate_op(program, config, heter_block_index,
heter_ops[device][heter_block_index],
block_var_detail)
remove_trainer_send_op(program, config, heter_block_index,
block_var_detail)
deleter_trainer_useless_var(program)
check_op_device(program.global_block(), DEFAULT_DEVICE)
def replace_ops_by_communicate_op(program, config, heter_block_index, ops_list,
block_var_detail):
all_op = program.global_block().ops
start_op = ops_list[0]
first_op_idx = -1
for op in all_op:
if is_same_op(op, start_op):
first_op_idx = all_op.index(op)
break
assert first_op_idx != -1
delete_same_ops(program.global_block(), ops_list)
mode = config.get_distributed_mode()
heter_worker_endpoint = config.get_heter_worker_endpoint()
entrance_var = block_var_detail[heter_block_index]["entrance"]
exit_var = block_var_detail[heter_block_index]["exit"]
default_device_comm_info = get_communicate_var_info(
program, heter_block_index - 1,
block_var_detail[heter_block_index - 1]["entrance"],
block_var_detail[heter_block_index - 1]["exit"])
comm_info = get_communicate_var_info(program, heter_block_index,
entrance_var, exit_var)
# create reshape op
for i in range(len(entrance_var)):
insert_reshape_op(
program,
program.global_block(), first_op_idx, entrance_var[i],
default_device_comm_info["output_var_reshape_name"][i],
[-1, default_device_comm_info["output_var_reshape_dim"][i]])
first_op_idx += 1
# create concat op
insert_send_concat_op(
program,
program.global_block(), first_op_idx,
default_device_comm_info["output_var_reshape_name"],
default_device_comm_info["block_output_var_name"],
[-1, sum(default_device_comm_info["output_var_reshape_dim"])])
first_op_idx += 1
# create send op
send_input_vars = [
program.global_block().vars[default_device_comm_info[
"block_output_var_name"]]
]
get_type_var_name = comm_info["output_var_reshape_name"][0].split(
".output_reshape@Heter")[0]
get_type_var = program.global_block().vars[get_type_var_name]
program.global_block().create_var(
name=comm_info["block_output_var_name"],
shape=(-1, sum(comm_info["output_var_reshape_dim"])),
dtype=get_type_var.dtype,
type=get_type_var.type)
recv_vars = [
program.global_block().vars[comm_info["block_output_var_name"]]
]
program.global_block()._insert_op(
index=first_op_idx,
type="send_and_recv",
inputs={"X": send_input_vars},
outputs={"Out": recv_vars},
attrs={
"send_var_name": default_device_comm_info["block_output_var_name"],
"recv_var_name": comm_info["block_output_var_name"],
"endpoint": heter_worker_endpoint,
"trainer_id": config.get_role_id(),
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
first_op_idx += 1
# recv
# create slice op
insert_recv_slice_op(
program,
program.global_block(), first_op_idx,
comm_info["block_output_var_name"],
(-1, sum(comm_info["output_var_reshape_dim"])), get_type_var.dtype,
get_type_var.type, comm_info["output_var_reshape_name"], [
(-1, comm_info["output_var_reshape_dim"][i])
for i in range(len(comm_info["output_var_reshape_dim"]))
])
first_op_idx += len(comm_info["output_var_reshape_dim"])
# create reshape op
for i in range(len(comm_info["output_var_reshape_name"])):
var_name = comm_info["output_var_reshape_name"][i].split(
".output_reshape@Heter")[0]
insert_reshape_op(
program,
program.global_block(),
first_op_idx,
comm_info["output_var_reshape_name"][i],
var_name, )
first_op_idx += 1
def remove_trainer_send_op(program, config, heter_block_index,
block_var_detail):
# if the trainer does FF->BP->SEND, it has the following vars: var, var@GRAD
# if the trainer only does SEND, it has one var: var@GRAD
# delete the send op if the trainer does not have the paired var (var <-> var@GRAD)
persistables = block_var_detail[heter_block_index]["persistables"]
need_remove_send_op = []
need_remove_grad_var = []
for op in find_send_op(program):
input_list, _ = find_op_input_output(program,
program.global_block(), op)
for var_name in input_list:
origin_var_name = var_name.split("@GRAD")[0]
if origin_var_name in persistables:
need_remove_send_op.append(op)
need_remove_grad_var.append(var_name)
need_remove_send_op = list(set(need_remove_send_op))
delete_ops(program.global_block(), need_remove_send_op)
for grad_var_name in need_remove_grad_var:
config.remove_var_pair_by_grad(grad_var_name)
def add_heter_send_op(program, heter_program, block, block_var_detail):
def _get_send_op_dict():
send_op_dict = {}
send_op_list = find_send_op(program)
for op in send_op_list:
input_list, _ = find_op_input_output(program,
program.global_block(), op)
for var in input_list:
send_op_dict[var] = op
return send_op_dict
send_grad_var_list = []
send_op_dict = _get_send_op_dict()
for persistable_var in block_var_detail["persistables"]:
# check var_name == var@GRAD
if "@GRAD" not in persistable_var:
continue
if "GRAD" != persistable_var.split("@")[-1]:
continue
if persistable_var not in send_op_dict:
continue
block_append_op(program, heter_program, block,
send_op_dict[persistable_var])
send_grad_var_list.append(persistable_var)
return send_grad_var_list
def find_send_op(program):
send_op_list = []
for op in program.global_block().ops:
if op.type == "send":
send_op_list.append(op)
return send_op_list
def get_communicate_var_info(program, block_index, entrance_var_list,
exit_var_list):
input_var_reshape_dim = []
input_var_reshape_name = []
block_input_var_name = "joint_{}_{}@Heter".format(block_index - 1,
block_index)
output_var_reshape_dim = []
output_var_reshape_name = []
block_output_var_name = "joint_{}_{}@Heter".format(block_index,
block_index + 1)
entrance_var_list.sort()
exit_var_list.sort()
# input
# Heter_SERVER_BLOCK_index@JOINT_VAR -> slice -> var@Heter_SERVER_BLOCK@INPUT_RESHAPE_VAR -> reshape -> var
for name in entrance_var_list:
var = program.global_block().vars[name]
shape = var.shape
if len(shape) < 2 or shape[0] != -1:
raise ValueError(
"Variable {} not support heter training. its shape is {}".
format(name, shape))
recv_var_dim = -1 * reduce(lambda x, y: x * y, shape)
input_var_reshape_dim.append(recv_var_dim)
input_var_reshape_name.append("{}.input_reshape@Heter".format(name))
# output
# var -> reshape -> var@Heter_SERVER_BLOCK@INPUT_RESHAPE_VAR -> concat -> Heter_SERVER_BLOCK_index@JOINT_VAR
for var_name in exit_var_list:
var = program.global_block().vars[var_name]
shape = var.shape
if len(shape) < 2 or shape[0] != -1:
raise ValueError(
"Variable {} not support heter training. its shape is {}".
format(var_name, shape))
send_reshape_dim = -1 * reduce(lambda x, y: x * y, shape)
output_var_reshape_dim.append(send_reshape_dim)
output_var_reshape_name.append("{}.output_reshape@Heter".format(
var_name))
info = {
"input_var_reshape_dim": input_var_reshape_dim,
"input_var_reshape_name": input_var_reshape_name,
"block_input_var_name": block_input_var_name,
"output_var_reshape_dim": output_var_reshape_dim,
"output_var_reshape_name": output_var_reshape_name,
"block_output_var_name": block_output_var_name
}
return info
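The flattening rule used above, shown in isolation with hypothetical shapes, to make explicit how the reshape dims are derived from variables whose leading dimension is -1:

from functools import reduce

# A shape like (-1, d1, d2, ...) flattens to d1 * d2 * ... per sample.
for shape in [(-1, 32), (-1, 4, 8)]:
    flat_dim = -1 * reduce(lambda x, y: x * y, shape)
    print(shape, "->", flat_dim)   # (-1, 32) -> 32, (-1, 4, 8) -> 32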
def find_block_joints(program, program_block_ops_list, heter_ops):
block_var_detail = find_entrance_exit_private(program,
program_block_ops_list)
block_var_detail = entrance_exit_check(program, program_block_ops_list,
block_var_detail, heter_ops)
block_var_detail = delete_block_useless_exit(
program, program_block_ops_list, block_var_detail)
return block_var_detail
def find_entrance_exit_private(program, program_block_ops_list):
block_var_detail = []
persistables = []
for index, block_op_list in enumerate(program_block_ops_list):
block_input, block_output = find_ops_list_input_output(program,
block_op_list)
persistables = screen_persistables(
program, block_input) + screen_persistables(program, block_output)
# find entrance & exit
block_private_vars = list(set(block_input) & set(block_output))
block_entrance = list(set(block_input) - set(block_private_vars))
block_exit = list(set(block_output) - set(block_private_vars))
detail = {
"entrance": block_entrance,
"exit": block_exit,
"private": block_private_vars,
"persistables": persistables
}
block_var_detail.append(detail)
return block_var_detail
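The entrance/exit/private set algebra above, applied to a tiny hypothetical block that reads a and b and writes b and c:

block_input = ["a", "b"]                                  # variables read by the block
block_output = ["b", "c"]                                 # variables written by the block
private = list(set(block_input) & set(block_output))      # ["b"]: produced and consumed inside
entrance = list(set(block_input) - set(private))          # ["a"]: must come from the previous block
exit_vars = list(set(block_output) - set(private))        # ["c"]: handed to the next block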
def entrance_exit_check(program, program_block_ops_list, block_var_detail,
heter_ops):
for index in range(len(block_var_detail) - 1, -1, -1):
if index - 1 < 0:
break
previous_block_exit = block_var_detail[index - 1]["exit"]
previous_block_exit.sort()
current_block_entrance = block_var_detail[index]["entrance"]
current_block_entrance.sort()
if previous_block_exit == current_block_entrance:
continue
exist_vars = list(
set(previous_block_exit) & set(current_block_entrance))
need_add_vars = list(set(current_block_entrance) - set(exist_vars))
need_add_vars = find_need_var_from_previous_block(
need_add_vars, block_var_detail, index, heter_ops)
previous_block_private = block_var_detail[index - 1]["private"]
previous_block_entrance = block_var_detail[index - 1]["entrance"]
for var in need_add_vars:
if var not in previous_block_private and var not in previous_block_entrance:
previous_block_entrance.append(var)
previous_block_exit.append(var)
return block_var_detail
def find_need_var_from_previous_block(need_add_vars, block_var_detail,
current_index, heter_ops):
# create index_device_map
index_device_map = {}
for index in range(len(block_var_detail)):
index_device_map[index] = DEFAULT_DEVICE
for device in heter_ops:
for index in heter_ops[device].keys():
index_device_map[index] = device
pre_index = current_index - 1
need_ignore_var = []
# if need_add_var in current device, no need communicate
for var in need_add_vars:
while (pre_index >= 0):
previous_block_private = block_var_detail[pre_index]["private"]
previous_block_exit = block_var_detail[pre_index]["exit"]
previous_block_entrance = block_var_detail[pre_index]["entrance"]
total_var = previous_block_private + previous_block_exit + previous_block_entrance
if var in total_var:
if index_device_map[current_index] == index_device_map[
pre_index] and index_device_map[
current_index] == DEFAULT_DEVICE:
need_ignore_var.append(var)
break
pre_index -= 1
need_add_vars = list(set(need_add_vars).difference(set(need_ignore_var)))
return need_add_vars
def delete_block_useless_exit(program, program_block_ops_list,
block_var_detail):
for index in range(len(block_var_detail)):
if index == len(block_var_detail) - 1:
break
current_block_exit = block_var_detail[index]["exit"]
next_block_entrance = block_var_detail[index + 1]["entrance"]
need_delete_var = []
for var in current_block_exit:
if var not in next_block_entrance:
need_delete_var.append(var)
for var in need_delete_var:
current_block_exit.remove(var)
return block_var_detail
def check_op_device(block, device):
for op in block.ops:
op._set_attr('op_device', device)
def screen_persistables(program, var_list):
need_remove = []
for var_name in var_list:
if "@GRAD" in var_name:
origin_var_name = var_name.split("@GRAD")[0]
var = program.global_block().vars[origin_var_name]
else:
var = program.global_block().vars[var_name]
if fluid.io.is_persistable(var):
need_remove.append(var_name)
for var_name in need_remove:
var_list.remove(var_name)
return need_remove
def insert_reshape_op(program,
block,
index,
var_name,
new_var_name,
new_var_shape=None):
input_var = program.global_block().vars[var_name]
if new_var_name not in program.global_block().vars:
out = program.global_block().create_var(
name=new_var_name,
shape=new_var_shape,
dtype=input_var.dtype,
type=input_var.type)
else:
out = program.global_block().vars[new_var_name]
new_var_shape = out.shape
x_shape = program.global_block().create_var(
name="{}.xshape@Heter".format(var_name), dtype=input_var.dtype)
block._insert_op(
index=index,
type="reshape2",
inputs={"X": input_var},
attrs={'shape': new_var_shape},
outputs={"Out": out,
"XShape": x_shape})
def insert_send_concat_op(program, block, index, var_name_list, new_var_name,
new_var_shape):
input_var_list = [
program.global_block().vars[var_name] for var_name in var_name_list
]
out = program.global_block().create_var(
name=new_var_name,
shape=new_var_shape,
dtype=input_var_list[0].dtype,
type=input_var_list[0].type)
block._insert_op(
index=index,
type='concat',
inputs={"X": input_var_list},
outputs={'Out': [out]},
attrs={'axis': -1,
'use_stack': False})
def insert_recv_slice_op(program, block, index, var_name, var_shape, dtype,
type, new_var_name_list, new_var_shape_list):
if var_name not in program.global_block().vars:
input_var = program.global_block().create_var(
name=var_name, shape=var_shape, dtype=dtype, type=type)
else:
input_var = program.global_block().vars[var_name]
out_list = []
for i in range(len(new_var_name_list)):
if new_var_name_list[i] not in program.global_block().vars:
out = program.global_block().create_var(
name=new_var_name_list[i],
shape=new_var_shape_list[i],
dtype=input_var.dtype,
type=input_var.type)
else:
out = program.global_block().vars[new_var_name_list[i]]
out_list.append(out)
start_index = 0
end_index = 0
for i in range(len(new_var_name_list)):
starts = []
ends = []
attrs = {'axes': [1]}
end_index += new_var_shape_list[i][1]
starts.append(start_index)
ends.append(end_index)
attrs['starts'] = starts
attrs['ends'] = ends
block._insert_op(
index=index,
type='slice',
inputs={'Input': input_var},
attrs=attrs,
outputs={'Out': out_list[i]})
start_index = end_index
index += 1
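Worked slice ranges produced by the loop above for two hypothetical pieces of width 32 and 16:

# new_var_shape_list = [(-1, 32), (-1, 16)]
# piece 0: axes=[1], starts=[0],  ends=[32]
# piece 1: axes=[1], starts=[32], ends=[48]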
def deleter_trainer_useless_var(program):
program_useful_var_list = []
for op in program.global_block().ops:
input_var_list, output_var_list = find_op_input_output(
program, program.global_block(), op)
op_var_list = list(set(input_var_list).union(set(output_var_list)))
program_useful_var_list = list(
set(program_useful_var_list).union(set(op_var_list)))
program_useless_var_list = list(
set(get_vars_name_in_block(program.global_block())).difference(
set(program_useful_var_list)))
for var in program_useless_var_list:
program.global_block()._remove_var(var)
return program_useless_var_list
def block_append_op(program, origin_program, block, op):
inputs = _get_input_map_from_op(origin_program.global_block().vars, op)
for key, varlist in six.iteritems(inputs):
if not isinstance(varlist, list):
varlist = [varlist]
for var in varlist:
if var.name not in program.global_block().vars:
program.global_block()._clone_variable(var)
outputs = _get_output_map_from_op(origin_program.global_block().vars, op)
for key, varlist in six.iteritems(outputs):
if not isinstance(varlist, list):
varlist = [varlist]
for var in varlist:
if var.name not in program.global_block().vars:
program.global_block()._clone_variable(var)
if "_grad" not in op.type:
# for forward op
return block.append_op(
type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs())
else:
# for grad op
op_desc = op.desc
op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
backward = core.op_proto_and_checker_maker.OpRole.Backward
device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
# append grad op
new_op_desc = block.desc.append_op()
new_op_desc.copy_from(op_desc)
new_op_desc._set_attr(op_role_attr_name, backward)
# set device guard
if op.desc.has_attr(device_attr_name):
op_device = op_desc.attr(device_attr_name)
new_op_desc._set_attr(device_attr_name, op_device)
block._sync_with_cpp()
def add_vars_by_op_map(var_map, program):
for key, varlist in six.iteritems(var_map):
if not isinstance(varlist, list):
varlist = [varlist]
for i in range(len(varlist)):
var = varlist[i]
if var.name not in program.global_block().vars:
program.global_block()._clone_variable(var)
def add_vars_by_var_list(var_name_list, origin_program, program):
for var_name in var_name_list:
if var_name not in program.global_block().vars:
var = origin_program.global_block().vars[var_name]
program.global_block()._clone_variable(var)
def get_varlist_from_op_map(var_map):
var_list = []
for key, varlist in six.iteritems(var_map):
if not isinstance(varlist, list):
varlist = [varlist]
for i in range(len(varlist)):
var = varlist[i]
var_list.append(var.name)
return var_list
def find_ops_list_input_output(program, ops_list):
input_var_list = []
output_var_list = []
for op in ops_list:
inputs = _get_input_map_from_op(program.global_block().vars, op)
input_var_list += get_varlist_from_op_map(inputs)
outputs = _get_output_map_from_op(program.global_block().vars, op)
output_var_list += get_varlist_from_op_map(outputs)
input_var_list = list(set(input_var_list))
output_var_list = list(set(output_var_list))
return input_var_list, output_var_list
def find_op_input_output(program, block, op):
input_var_list = []
output_var_list = []
inputs = _get_input_map_from_op(block.vars, op)
input_var_list += get_varlist_from_op_map(inputs)
outputs = _get_output_map_from_op(block.vars, op)
output_var_list += get_varlist_from_op_map(outputs)
input_var_list = list(set(input_var_list))
output_var_list = list(set(output_var_list))
return input_var_list, output_var_list
def get_vars_name_in_block(block):
vars_list = block.vars.keys()
vars_name_list = [var_name for var_name in vars_list]
return vars_name_list
def is_same_op(op1, op2):
if str(op1) != str(op2):
return False
return True
def _get_input_map_from_op(varmap, op):
"""Returns a dict from op input name to the vars in varmap."""
iomap = collections.OrderedDict()
for key in op.input_names:
vars = []
for varname in op.input(key):
if varname == "@EMPTY@":
continue
if "lod_tensor_blocking_queue" in varname:
continue
vars.append(varmap[varname])
if len(vars) == 1:
iomap[key] = vars[0]
else:
iomap[key] = vars
return iomap
def _get_output_map_from_op(varmap, op):
"""Returns a dict from op output name to the vars in varmap."""
iomap = collections.OrderedDict()
for key in op.output_names:
vars = []
for varname in op.output(key):
if varname == "@EMPTY@":
continue
if "lod_tensor_blocking_queue" in varname:
continue
vars.append(varmap[varname])
if len(vars) == 1:
iomap[key] = vars[0]
else:
iomap[key] = vars
return iomap
def delete_same_ops(block, ops):
for op in ops:
try:
for origin_op in block.ops:
if is_same_op(origin_op, op):
idx = list(block.ops).index(origin_op)
block._remove_op(idx)
break
except Exception as e:
print(e)
......@@ -1858,6 +1858,7 @@ def conv3d(input,
return helper.append_activation(pre_act)
@deprecated(since="2.0.0", update_to="paddle.nn.functional.pool2d")
@templatedoc()
def pool2d(input,
pool_size=-1,
......@@ -2075,6 +2076,7 @@ def pool2d(input,
return pool_out
@deprecated(since="2.0.0", update_to="paddle.nn.functional.pool3d")
@templatedoc()
def pool3d(input,
pool_size=-1,
......@@ -2303,6 +2305,7 @@ def pool3d(input,
return pool_out
@deprecated(since="2.0.0", update_to="paddle.nn.functional.adaptive_pool2d")
@templatedoc(op_type="pool2d")
def adaptive_pool2d(input,
pool_size,
......@@ -2450,6 +2453,7 @@ def adaptive_pool2d(input,
return (pool_out, mask) if require_index else pool_out
@deprecated(since="2.0.0", update_to="paddle.nn.functional.adaptive_pool3d")
@templatedoc(op_type="pool3d")
def adaptive_pool3d(input,
pool_size,
......@@ -10205,6 +10209,7 @@ def unstack(x, axis=0, num=None):
return outs
@deprecated(since='2.0.0', update_to="paddle.expand")
def expand(x, expand_times, name=None):
"""
:alias_main: paddle.expand
......@@ -10312,6 +10317,7 @@ def expand(x, expand_times, name=None):
return out
@deprecated(since='2.0.0', update_to="paddle.expand_as")
def expand_as(x, target_tensor, name=None):
"""
:alias_main: paddle.expand_as
......@@ -10377,6 +10383,9 @@ def expand_as(x, target_tensor, name=None):
#(3,20)
"""
if in_dygraph_mode():
return core.ops.expand_as(x, target_tensor)
check_variable_and_dtype(
x, 'x', ['float32', 'float64', 'int32', 'int64', 'bool'], 'expand_as')
check_variable_and_dtype(target_tensor, 'target_tensor',
......@@ -15004,6 +15013,7 @@ def gather_tree(ids, parents):
return out
@deprecated(since="2.0.0", update_to="paddle.uniform")
@templatedoc()
def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0,
name=None):
......
......@@ -17,8 +17,9 @@ from __future__ import print_function
import os
import logging
import tarfile
import tempfile
import random
import warnings
import paddle
import paddle.fluid.incubate.data_generator as data_generator
......@@ -57,7 +58,7 @@ def load_dnn_input_record(sent):
def load_lr_input_record(sent):
res = []
for _ in [x.split(':') for x in sent.split()]:
res.append(int(_[0]))
res.append(int(_[0]) % 10000)
return res
......@@ -120,9 +121,62 @@ def prepare_data():
lr_input_dim = res[1]
logger.info('dnn input dim: %d' % dnn_input_dim)
logger.info('lr input dim: %d' % lr_input_dim)
return dnn_input_dim, lr_input_dim, train_file_path
def gen_fake_line(dnn_data_num=7,
dnn_data_range=1e5,
lr_data_num=5,
lr_data_range=1e5):
line = ""
# for deep data
for index in range(dnn_data_num):
data = str(random.randint(0, dnn_data_range - 1))
if index < dnn_data_num - 1:
data += " "
line += data
line += "\t"
# for wide data
for index in range(lr_data_num):
data = str(random.randint(0, lr_data_range - 1)) + ":" + str(1)
if index < lr_data_num - 1:
data += " "
line += data
line += "\t"
# for label
line += str(random.randint(0, 1))
line += "\n"
return line
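A sample of the line layout produced above (the numbers themselves are random; this only shows the tab-separated structure):

# <7 dnn ids separated by spaces>\t<5 "id:1" pairs separated by spaces>\t<label>\n
# e.g. "523 77 901 4 18 6 33\t12:1 88:1 5:1 6:1 91:1\t0\n"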
def prepare_fake_data(file_nums=8, file_lines=1000):
"""
Create fake data with the same format as avazu_ctr_data
"""
file_dir = tempfile.mkdtemp()
warnings.warn("Fake data write in {}".format(file_dir))
for file_index in range(file_nums):
with open(
os.path.join(file_dir,
"ctr_train_data_part_{}".format(file_index)),
'w+') as fin:
file_str = ""
for line_index in range(file_lines):
file_str += gen_fake_line()
fin.write(file_str)
warnings.warn("Write done ctr_train_data_part_{}".format(
file_index))
file_list = [os.path.join(file_dir, x) for x in os.listdir(file_dir)]
assert len(file_list) == file_nums
return file_list
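Minimal usage sketch of the helper above, with small sizes for a quick smoke run:

file_list = prepare_fake_data(file_nums=2, file_lines=10)   # 2 files x 10 fake lines in a temp dir
assert len(file_list) == 2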
if __name__ == "__main__":
pairwise_reader = DatasetCtrReader()
pairwise_reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Distributed CTR model for testing the fleet api
"""
from __future__ import print_function
import shutil
import tempfile
import time
import paddle
import paddle.fluid as fluid
import os
import numpy as np
import ctr_dataset_reader
from test_dist_fleet_heter_base import runtime_main, FleetDistHeterRunnerBase
from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader
from paddle.distributed.fleet.base.util_factory import fleet_util
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
"""
Test the CTR model using the Fleet api
"""
def net(self, args, batch_size=4, lr=0.01):
"""
network definition
Args:
batch_size(int): the size of mini-batch for training
lr(float): learning rate of training
Returns:
avg_cost: LoDTensor of cost.
"""
dnn_input_dim, lr_input_dim = int(1e5), int(1e5)
dnn_data = fluid.layers.data(
name="dnn_data",
shape=[-1, 1],
dtype="int64",
lod_level=1,
append_batch_size=False)
lr_data = fluid.layers.data(
name="lr_data",
shape=[-1, 1],
dtype="int64",
lod_level=1,
append_batch_size=False)
label = fluid.layers.data(
name="click",
shape=[-1, 1],
dtype="float32",
lod_level=0,
append_batch_size=False)
datas = [dnn_data, lr_data, label]
if args.reader == "pyreader":
self.reader = fluid.io.PyReader(
feed_list=datas,
capacity=64,
iterable=False,
use_double_buffer=False)
# build dnn model
dnn_layer_dims = [128, 64, 32, 1]
dnn_embedding = fluid.layers.embedding(
is_distributed=False,
input=dnn_data,
size=[dnn_input_dim, dnn_layer_dims[0]],
param_attr=fluid.ParamAttr(
name="deep_embedding",
initializer=fluid.initializer.Constant(value=0.01)),
is_sparse=True)
dnn_pool = fluid.layers.sequence_pool(
input=dnn_embedding, pool_type="sum")
dnn_out = dnn_pool
# build lr model
lr_embedding = fluid.layers.embedding(
is_distributed=False,
input=lr_data,
size=[lr_input_dim, 1],
param_attr=fluid.ParamAttr(
name="wide_embedding",
initializer=fluid.initializer.Constant(value=0.01)),
is_sparse=True)
lr_pool = fluid.layers.sequence_pool(input=lr_embedding, pool_type="sum")
with fluid.device_guard("gpu"):
for i, dim in enumerate(dnn_layer_dims[1:]):
fc = fluid.layers.fc(
input=dnn_out,
size=dim,
act="relu",
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)),
name='dnn-fc-%d' % i)
dnn_out = fc
merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
label = fluid.layers.cast(label, dtype="int64")
predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
fluid.layers.Print(avg_cost, message="avg_cost")
self.feeds = datas
self.train_file_path = ["fake1", "fake2"]
self.avg_cost = avg_cost
self.predict = predict
return avg_cost
def check_model_right(self, dirname):
model_filename = os.path.join(dirname, "__model__")
with open(model_filename, "rb") as f:
program_desc_str = f.read()
program = fluid.Program.parse_from_string(program_desc_str)
with open(os.path.join(dirname, "__model__.proto"), "w") as wn:
wn.write(str(program))
def do_pyreader_training(self, fleet):
"""
do training using py_reader, feeding mini-batches from a python reader
Args:
fleet(Fleet api): the fleet object of Parameter Server, define distribute training role
"""
exe = fluid.Executor(fluid.CPUPlace())
fleet.init_worker()
exe.run(fluid.default_startup_program())
batch_size = 4
train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
self.reader.decorate_sample_list_generator(train_reader)
for epoch_id in range(1):
self.reader.start()
try:
pass_start = time.time()
while True:
exe.run(program=fluid.default_main_program())
pass_time = time.time() - pass_start
except fluid.core.EOFException:
self.reader.reset()
fleet.stop_worker()
def do_dataset_training(self, fleet):
train_file_list = ctr_dataset_reader.prepare_fake_data()
exe = fluid.Executor(fluid.CPUPlace())
fleet.init_worker()
exe.run(fluid.default_startup_program())
thread_num = 1
batch_size = 128
filelist = fleet_util.get_file_shard(train_file_list)
print("filelist: {}".format(filelist))
# config dataset
dataset = paddle.distributed.fleet.DatasetFactory().create_dataset()
dataset.set_batch_size(batch_size)
dataset.set_use_var(self.feeds)
pipe_command = 'python ctr_dataset_reader.py'
dataset.set_pipe_command(pipe_command)
dataset.set_filelist(filelist)
dataset.set_thread(thread_num)
for epoch_id in range(1):
pass_start = time.time()
dataset.set_filelist(filelist)
exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=[self.avg_cost],
fetch_info=["cost"],
print_period=2,
debug=int(os.getenv("Debug", "0")))
pass_time = time.time() - pass_start
print("do_dataset_training done. using time {}".format(pass_time))
if os.getenv("SAVE_MODEL") == "1":
model_dir = tempfile.mkdtemp()
fleet.save_inference_model(exe, model_dir,
[feed.name for feed in self.feeds],
self.avg_cost)
self.check_model_right(model_dir)
shutil.rmtree(model_dir)
fleet.stop_worker()
print("do_dataset_training stop worker.")
if __name__ == "__main__":
runtime_main(TestHeterPsCTR2x2)
......@@ -17,6 +17,8 @@ from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle
import paddle.fluid as fluid
class TestAdadeltaOp1(OpTest):
......@@ -108,5 +110,54 @@ class TestAdadeltaOp2(OpTest):
self.check_output()
class TestAdadeltaV2(unittest.TestCase):
def test_adadelta_dygraph(self):
paddle.disable_static(paddle.CPUPlace())
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
        adadelta = paddle.optimizer.Adadelta(
            learning_rate=0.01,
            parameters=linear.parameters(),
            weight_decay=0.01)
        out = linear(a)
        out.backward()
        adadelta.step()
        adadelta.clear_gradients()
def test_adadelta(self):
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
            adadelta_optimizer = paddle.optimizer.Adadelta(learning_rate=0.1)
            adadelta_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for data in train_reader():
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
def test_raise_error(self):
self.assertRaises(ValueError, paddle.optimizer.Adadelta, None)
self.assertRaises(
ValueError, paddle.optimizer.Adadelta, learning_rate=0.1, rho=None)
self.assertRaises(
ValueError,
paddle.optimizer.Adadelta,
learning_rate=0.1,
epsilon=None)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
import paddle
import paddle.nn.functional as F
def adaptive_start_index(index, input_size, output_size):
return int(np.floor(index * input_size / output_size))
def adaptive_end_index(index, input_size, output_size):
return int(np.ceil((index + 1) * input_size / output_size))
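# Adaptive pooling splits the length L into `output_size` nearly equal windows:
# window i covers [floor(i * L / out), ceil((i + 1) * L / out)).
# E.g. for L = 7 and out = 3 the windows are [0, 3), [2, 5) and [4, 7).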
def avg_pool1D_forward_naive(x,
ksize,
strides,
paddings,
global_pool=0,
ceil_mode=False,
exclusive=False,
adaptive=False,
data_type=np.float64):
N, C, L = x.shape
if global_pool == 1:
ksize = [L]
if adaptive:
L_out = ksize[0]
else:
L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1
) // strides[0] + 1 if ceil_mode else (
L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
out = np.zeros((N, C, L_out))
for i in range(L_out):
if adaptive:
r_start = adaptive_start_index(i, L, ksize[0])
r_end = adaptive_end_index(i, L, ksize[0])
else:
r_start = np.max((i * strides[0] - paddings[0], 0))
r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L))
x_masked = x[:, :, r_start:r_end]
field_size = (r_end - r_start) \
if (exclusive or adaptive) else (ksize[0])
        if data_type == np.int8 or data_type == np.uint8:
            out[:, :, i] = (np.rint(
                np.sum(x_masked, axis=2) / field_size)).astype(data_type)
else:
out[:, :, i] = (np.sum(x_masked, axis=(2)) /
field_size).astype(data_type)
return out
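# Note: with adaptive=True, `strides` and `paddings` are ignored and ksize[0]
# is taken as the output length, which is why the checks below can pass
# strides=[0] or strides=[2] interchangeably.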
class TestPool1d_API(unittest.TestCase):
def setUp(self):
np.random.seed(123)
self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0))
def check_adaptive_avg_dygraph_results(self, place):
with fluid.dygraph.guard(place):
input_np = np.random.random([2, 3, 32]).astype("float32")
input = fluid.dygraph.to_variable(input_np)
result = F.adaptive_avg_pool1d(input, output_size=16)
result_np = avg_pool1D_forward_naive(
input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
self.assertTrue(np.allclose(result.numpy(), result_np))
            ada_avg_pool1d_dg = paddle.nn.layer.AdaptiveAvgPool1d(
                output_size=16)
            result = ada_avg_pool1d_dg(input)
self.assertTrue(np.allclose(result.numpy(), result_np))
def check_adaptive_avg_static_results(self, place):
with fluid.program_guard(fluid.Program(), fluid.Program()):
input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
result = F.adaptive_avg_pool1d(input, output_size=16)
input_np = np.random.random([2, 3, 32]).astype("float32")
result_np = avg_pool1D_forward_naive(
input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
exe = fluid.Executor(place)
fetches = exe.run(fluid.default_main_program(),
feed={"input": input_np},
fetch_list=[result])
self.assertTrue(np.allclose(fetches[0], result_np))
def test_adaptive_avg_pool1d(self):
for place in self.places:
self.check_adaptive_avg_dygraph_results(place)
self.check_adaptive_avg_static_results(place)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import unittest
from op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid import compiler, Program, program_guard
import paddle
import paddle.nn.functional as F
import paddle.fluid as fluid
def adaptive_start_index(index, input_size, output_size):
return int(np.floor(index * input_size / output_size))
def adaptive_end_index(index, input_size, output_size):
return int(np.ceil((index + 1) * input_size / output_size))
def max_pool1D_forward_naive(x,
ksize,
strides,
paddings,
global_pool=0,
ceil_mode=False,
exclusive=False,
adaptive=False,
data_type=np.float64):
N, C, L = x.shape
if global_pool == 1:
ksize = [L]
if adaptive:
L_out = ksize[0]
else:
L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1
) // strides[0] + 1 if ceil_mode else (
L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
out = np.zeros((N, C, L_out))
for i in range(L_out):
if adaptive:
r_start = adaptive_start_index(i, L, ksize[0])
r_end = adaptive_end_index(i, L, ksize[0])
else:
r_start = np.max((i * strides[0] - paddings[0], 0))
r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L))
x_masked = x[:, :, r_start:r_end]
out[:, :, i] = np.max(x_masked, axis=(2))
return out
class TestPool1d_API(unittest.TestCase):
def setUp(self):
np.random.seed(123)
self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0))
def check_adaptive_max_dygraph_results(self, place):
with fluid.dygraph.guard(place):
input_np = np.random.random([2, 3, 32]).astype("float32")
input = fluid.dygraph.to_variable(input_np)
result = F.adaptive_max_pool1d(input, output_size=16)
result_np = max_pool1D_forward_naive(
input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
self.assertTrue(np.allclose(result.numpy(), result_np))
ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1d(
output_size=16)
result = ada_max_pool1d_dg(input)
self.assertTrue(np.allclose(result.numpy(), result_np))
def check_adaptive_max_static_results(self, place):
with fluid.program_guard(fluid.Program(), fluid.Program()):
input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
result = F.adaptive_max_pool1d(input, output_size=16)
input_np = np.random.random([2, 3, 32]).astype("float32")
result_np = max_pool1D_forward_naive(
input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
exe = fluid.Executor(place)
fetches = exe.run(fluid.default_main_program(),
feed={"input": input_np},
fetch_list=[result])
self.assertTrue(np.allclose(fetches[0], result_np))
def test_adaptive_max_pool1d(self):
for place in self.places:
self.check_adaptive_max_dygraph_results(place)
self.check_adaptive_max_static_results(place)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from __future__ import division
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
import paddle
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
def adaptive_start_index(index, input_size, output_size):
return int(np.floor(index * input_size / output_size))
def adaptive_end_index(index, input_size, output_size):
return int(np.ceil((index + 1) * input_size / output_size))
def adaptive_pool2d_forward(x, output_size, data_format='NCHW',
pool_type="max"):
N = x.shape[0]
C, H, W = [x.shape[1], x.shape[2], x.shape[3]] if data_format == 'NCHW' \
else [x.shape[3], x.shape[1], x.shape[2]]
    if isinstance(output_size, int) or output_size is None:
        H_out = output_size
        W_out = output_size
        output_size = [H_out, W_out]
    else:
        H_out, W_out = output_size
        if output_size[0] is None:
            output_size[0] = H
            H_out = H
        if output_size[1] is None:
            output_size[1] = W
            W_out = W
out = np.zeros((N, C, H_out, W_out)) if data_format=='NCHW' \
else np.zeros((N, H_out, W_out, C))
for i in range(H_out):
in_h_start = adaptive_start_index(i, H, output_size[0])
in_h_end = adaptive_end_index(i, H, output_size[0])
for j in range(W_out):
in_w_start = adaptive_start_index(j, W, output_size[1])
in_w_end = adaptive_end_index(j, W, output_size[1])
if data_format == 'NCHW':
x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end]
if pool_type == 'avg':
field_size = (
(in_h_end - in_h_start) * (in_w_end - in_w_start))
out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
elif pool_type == 'max':
out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
elif data_format == 'NHWC':
x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :]
if pool_type == 'avg':
field_size = (
(in_h_end - in_h_start) * (in_w_end - in_w_start))
out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size
elif pool_type == 'max':
out[:, i, j, :] = np.max(x_masked, axis=(1, 2))
return out
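# Worked example for the 7x7 inputs used below with output_size=[3, 3]: the
# row and column windows are [0, 3), [2, 5) and [4, 7), so e.g. out[:, :, 0, 0]
# reduces x[:, :, 0:3, 0:3] (max or mean depending on pool_type).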
class TestAdaptiveMaxPool2dAPI(unittest.TestCase):
def setUp(self):
self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
self.res_1_np = adaptive_pool2d_forward(
x=self.x_np, output_size=[3, 3], pool_type="max")
self.res_2_np = adaptive_pool2d_forward(
x=self.x_np, output_size=5, pool_type="max")
self.res_3_np = adaptive_pool2d_forward(
x=self.x_np, output_size=[2, 5], pool_type="max")
"""
self.res_4_np = adaptive_pool2d_forward(
x=self.x_np,
output_size=[3, 3],
pool_type="max",
data_format="NHWC")
"""
self.res_5_np = adaptive_pool2d_forward(
x=self.x_np, output_size=[None, 3], pool_type="max")
def test_static_graph(self):
for use_cuda in ([False, True]
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.enable_static()
x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32")
out_1 = paddle.nn.functional.adaptive_max_pool2d(
x=x, output_size=[3, 3])
out_2 = paddle.nn.functional.adaptive_max_pool2d(x=x, output_size=5)
out_3 = paddle.nn.functional.adaptive_max_pool2d(
x=x, output_size=[2, 5])
#out_4 = paddle.nn.functional.adaptive_max_pool2d(
# x=x, output_size=[3, 3], data_format="NHWC")
out_5 = paddle.nn.functional.adaptive_max_pool2d(
x=x, output_size=[None, 3])
exe = paddle.static.Executor(place=place)
[res_1, res_2, res_3, res_5] = exe.run(
fluid.default_main_program(),
feed={"x": self.x_np},
fetch_list=[out_1, out_2, out_3, out_5])
assert np.allclose(res_1, self.res_1_np)
assert np.allclose(res_2, self.res_2_np)
assert np.allclose(res_3, self.res_3_np)
#assert np.allclose(res_4, self.res_4_np)
assert np.allclose(res_5, self.res_5_np)
def test_dynamic_graph(self):
for use_cuda in ([False, True]
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
out_1 = paddle.nn.functional.adaptive_max_pool2d(
x=x, return_indices=False, output_size=[3, 3])
out_2 = paddle.nn.functional.adaptive_max_pool2d(x=x, output_size=5)
out_3 = paddle.nn.functional.adaptive_max_pool2d(
x=x, output_size=[2, 5])
#out_4 = paddle.nn.functional.adaptive_max_pool2d(
# x=x, output_size=[3, 3], data_format="NHWC")
out_5 = paddle.nn.functional.adaptive_max_pool2d(
x=x, output_size=[None, 3])
assert np.allclose(out_1.numpy(), self.res_1_np)
assert np.allclose(out_2.numpy(), self.res_2_np)
assert np.allclose(out_3.numpy(), self.res_3_np)
#assert np.allclose(out_4.numpy(), self.res_4_np)
assert np.allclose(out_5.numpy(), self.res_5_np)
class TestAdaptiveMaxPool2dClassAPI(unittest.TestCase):
def setUp(self):
self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
self.res_1_np = adaptive_pool2d_forward(
x=self.x_np, output_size=[3, 3], pool_type="max")
self.res_2_np = adaptive_pool2d_forward(
x=self.x_np, output_size=5, pool_type="max")
self.res_3_np = adaptive_pool2d_forward(
x=self.x_np, output_size=[2, 5], pool_type="max")
#self.res_4_np = adaptive_pool2d_forward(
# x=self.x_np,
# output_size=[3, 3],
# pool_type="max",
# data_format="NHWC")
self.res_5_np = adaptive_pool2d_forward(
x=self.x_np, output_size=[None, 3], pool_type="max")
def test_static_graph(self):
for use_cuda in ([False, True]
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.enable_static()
x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32")
adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3])
out_1 = adaptive_max_pool(x=x)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=5)
out_2 = adaptive_max_pool(x=x)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[2, 5])
out_3 = adaptive_max_pool(x=x)
# adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(
# output_size=[3, 3], data_format="NHWC")
# out_4 = adaptive_max_pool(x=x)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(
output_size=[None, 3])
out_5 = adaptive_max_pool(x=x)
exe = paddle.static.Executor(place=place)
[res_1, res_2, res_3, res_5] = exe.run(
fluid.default_main_program(),
feed={"x": self.x_np},
fetch_list=[out_1, out_2, out_3, out_5])
assert np.allclose(res_1, self.res_1_np)
assert np.allclose(res_2, self.res_2_np)
assert np.allclose(res_3, self.res_3_np)
#assert np.allclose(res_4, self.res_4_np)
assert np.allclose(res_5, self.res_5_np)
def test_dynamic_graph(self):
for use_cuda in ([False, True]
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3])
out_1 = adaptive_max_pool(x=x)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=5)
out_2 = adaptive_max_pool(x=x)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[2, 5])
out_3 = adaptive_max_pool(x=x)
#adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(
# output_size=[3, 3], data_format="NHWC")
#out_4 = adaptive_max_pool(x=x)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(
output_size=[None, 3])
out_5 = adaptive_max_pool(x=x)
assert np.allclose(out_1.numpy(), self.res_1_np)
assert np.allclose(out_2.numpy(), self.res_2_np)
assert np.allclose(out_3.numpy(), self.res_3_np)
#assert np.allclose(out_4.numpy(), self.res_4_np)
assert np.allclose(out_5.numpy(), self.res_5_np)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from __future__ import division
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
import paddle
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
def adaptive_start_index(index, input_size, output_size):
return int(np.floor(index * input_size / output_size))
def adaptive_end_index(index, input_size, output_size):
return int(np.ceil((index + 1) * input_size / output_size))
def adaptive_pool3d_forward(x,
output_size,
adaptive=True,
data_format='NCDHW',
pool_type='max'):
N = x.shape[0]
C, D, H, W = [x.shape[1], x.shape[2], x.shape[3], x.shape[4]] \
if data_format == 'NCDHW' else [x.shape[4], x.shape[1], x.shape[2],x.shape[3]]
    if isinstance(output_size, int) or output_size is None:
        H_out = output_size
        W_out = output_size
        D_out = output_size
        output_size = [D_out, H_out, W_out]
    else:
        D_out, H_out, W_out = output_size
        if output_size[0] is None:
            output_size[0] = D
            D_out = D
        if output_size[1] is None:
            output_size[1] = H
            H_out = H
        if output_size[2] is None:
            output_size[2] = W
            W_out = W
out = np.zeros((N, C, D_out, H_out, W_out)) if data_format=='NCDHW' \
else np.zeros((N, D_out, H_out, W_out, C))
for k in range(D_out):
d_start = adaptive_start_index(k, D, output_size[0])
d_end = adaptive_end_index(k, D, output_size[0])
for i in range(H_out):
h_start = adaptive_start_index(i, H, output_size[1])
h_end = adaptive_end_index(i, H, output_size[1])
for j in range(W_out):
w_start = adaptive_start_index(j, W, output_size[2])
w_end = adaptive_end_index(j, W, output_size[2])
if data_format == 'NCDHW':
x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:
w_end]
if pool_type == 'avg':
field_size = (d_end - d_start) * (h_end - h_start) * (
w_end - w_start)
out[:, :, k, i, j] = np.sum(x_masked,
axis=(2, 3, 4)) / field_size
elif pool_type == 'max':
out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
elif data_format == 'NDHWC':
x_masked = x[:, d_start:d_end, h_start:h_end, w_start:
w_end, :]
if pool_type == 'avg':
field_size = (d_end - d_start) * (h_end - h_start) * (
w_end - w_start)
out[:, k, i, j, :] = np.sum(x_masked,
axis=(1, 2, 3)) / field_size
elif pool_type == 'max':
out[:, k, i, j, :] = np.max(x_masked, axis=(1, 2, 3))
return out
class TestAdaptiveMaxPool3dAPI(unittest.TestCase):
def setUp(self):
self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
self.res_1_np = adaptive_pool3d_forward(
x=self.x_np, output_size=[3, 3, 3], pool_type="max")
self.res_2_np = adaptive_pool3d_forward(
x=self.x_np, output_size=5, pool_type="max")
self.res_3_np = adaptive_pool3d_forward(
x=self.x_np, output_size=[2, 3, 5], pool_type="max")
self.res_4_np = adaptive_pool3d_forward(
x=self.x_np,
output_size=[3, 3, 3],
pool_type="max",
data_format="NDHWC")
self.res_5_np = adaptive_pool3d_forward(
x=self.x_np, output_size=[None, 3, None], pool_type="max")
def test_static_graph(self):
for use_cuda in ([False, True]
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.enable_static()
x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
out_1 = paddle.nn.functional.adaptive_max_pool3d(
x=x, output_size=[3, 3, 3])
out_2 = paddle.nn.functional.adaptive_max_pool3d(x=x, output_size=5)
out_3 = paddle.nn.functional.adaptive_max_pool3d(
x=x, output_size=[2, 3, 5])
#out_4 = paddle.nn.functional.adaptive_max_pool3d(
# x=x, output_size=[3, 3, 3], data_format="NDHWC")
out_5 = paddle.nn.functional.adaptive_max_pool3d(
x=x, output_size=[None, 3, None])
exe = paddle.static.Executor(place=place)
[res_1, res_2, res_3, res_5] = exe.run(
fluid.default_main_program(),
feed={"x": self.x_np},
fetch_list=[out_1, out_2, out_3, out_5])
assert np.allclose(res_1, self.res_1_np)
assert np.allclose(res_2, self.res_2_np)
assert np.allclose(res_3, self.res_3_np)
#assert np.allclose(res_4, self.res_4_np)
assert np.allclose(res_5, self.res_5_np)
def test_dynamic_graph(self):
for use_cuda in ([False, True]
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
out_1 = paddle.nn.functional.adaptive_max_pool3d(
x=x, output_size=[3, 3, 3])
out_2 = paddle.nn.functional.adaptive_max_pool3d(x=x, output_size=5)
out_3 = paddle.nn.functional.adaptive_max_pool3d(
x=x, output_size=[2, 3, 5])
#out_4 = paddle.nn.functional.adaptive_max_pool3d(
# x=x, output_size=[3, 3, 3], data_format="NDHWC")
out_5 = paddle.nn.functional.adaptive_max_pool3d(
x=x, output_size=[None, 3, None])
assert np.allclose(out_1.numpy(), self.res_1_np)
assert np.allclose(out_2.numpy(), self.res_2_np)
assert np.allclose(out_3.numpy(), self.res_3_np)
#assert np.allclose(out_4.numpy(), self.res_4_np)
assert np.allclose(out_5.numpy(), self.res_5_np)
class TestAdaptiveMaxPool3dClassAPI(unittest.TestCase):
def setUp(self):
self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
self.res_1_np = adaptive_pool3d_forward(
x=self.x_np, output_size=[3, 3, 3], pool_type="max")
self.res_2_np = adaptive_pool3d_forward(
x=self.x_np, output_size=5, pool_type="max")
self.res_3_np = adaptive_pool3d_forward(
x=self.x_np, output_size=[2, 3, 5], pool_type="max")
# self.res_4_np = adaptive_pool3d_forward(
# x=self.x_np,
# output_size=[3, 3, 3],
# pool_type="max",
# data_format="NDHWC")
self.res_5_np = adaptive_pool3d_forward(
x=self.x_np, output_size=[None, 3, None], pool_type="max")
def test_static_graph(self):
for use_cuda in ([False, True]
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.enable_static()
x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
output_size=[3, 3, 3])
out_1 = adaptive_max_pool(x=x)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(output_size=5)
out_2 = adaptive_max_pool(x=x)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
output_size=[2, 3, 5])
out_3 = adaptive_max_pool(x=x)
# adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
# output_size=[3, 3, 3], data_format="NDHWC")
# out_4 = adaptive_max_pool(x=x)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
output_size=[None, 3, None])
out_5 = adaptive_max_pool(x=x)
exe = paddle.static.Executor(place=place)
[res_1, res_2, res_3, res_5] = exe.run(
fluid.default_main_program(),
feed={"x": self.x_np},
fetch_list=[out_1, out_2, out_3, out_5])
assert np.allclose(res_1, self.res_1_np)
assert np.allclose(res_2, self.res_2_np)
assert np.allclose(res_3, self.res_3_np)
# assert np.allclose(res_4, self.res_4_np)
assert np.allclose(res_5, self.res_5_np)
def test_dynamic_graph(self):
for use_cuda in ([False, True]
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
output_size=[3, 3, 3])
out_1 = adaptive_max_pool(x=x)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(output_size=5)
out_2 = adaptive_max_pool(x=x)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
output_size=[2, 3, 5])
out_3 = adaptive_max_pool(x=x)
# adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
# output_size=[3, 3, 3], data_format="NDHWC")
# out_4 = adaptive_max_pool(x=x)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
output_size=[None, 3, None])
out_5 = adaptive_max_pool(x=x)
assert np.allclose(out_1.numpy(), self.res_1_np)
assert np.allclose(out_2.numpy(), self.res_2_np)
assert np.allclose(out_3.numpy(), self.res_3_np)
# assert np.allclose(out_4.numpy(), self.res_4_np)
assert np.allclose(out_5.numpy(), self.res_5_np)
if __name__ == '__main__':
unittest.main()
......@@ -85,10 +85,35 @@ class TestBatchNorm(unittest.TestCase):
y = bn(fluid.dygraph.to_variable(x))
return y.numpy()
def compute_v3(x, is_test, trainable_statistics):
with fluid.dygraph.guard(p):
bn = fluid.dygraph.BatchNorm(
shape[1],
is_test=is_test,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.0),
trainable=False),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.0),
trainable=False),
trainable_statistics=trainable_statistics)
y = bn(fluid.dygraph.to_variable(x))
return y.numpy()
def compute_v4(x):
with fluid.dygraph.guard(p):
bn = paddle.nn.BatchNorm2d(
shape[1], weight_attr=False, bias_attr=False)
y = bn(fluid.dygraph.to_variable(x))
return y.numpy()
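        # compute_v3 freezes scale=1 and bias=0 (non-trainable), which should be
        # numerically equivalent to compute_v4's BatchNorm2d without affine
        # parameters, hence the y3/y4 comparison below.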
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x, False, False)
y2 = compute_v2(x)
y3 = compute_v3(x, False, False)
y4 = compute_v4(x)
self.assertTrue(np.allclose(y1, y2))
self.assertTrue(np.allclose(y3, y4))
def test_static(self):
places = [fluid.CPUPlace()]
......
......@@ -166,12 +166,16 @@ class TestClipAPI(unittest.TestCase):
data_shape = [1, 9, 9, 4]
data = np.random.random(data_shape).astype('float32')
images = paddle.to_variable(data, dtype='float32')
v_min = paddle.to_variable(np.array([0.2], dtype=np.float32))
v_max = paddle.to_variable(np.array([0.8], dtype=np.float32))
out_1 = paddle.clip(images, min=0.2, max=0.8)
out_2 = paddle.clip(images, min=0.2, max=0.9)
out_3 = paddle.clip(images, min=v_min, max=v_max)
self.assertTrue(np.allclose(out_1.numpy(), data.clip(0.2, 0.8)))
self.assertTrue(np.allclose(out_2.numpy(), data.clip(0.2, 0.9)))
self.assertTrue(np.allclose(out_3.numpy(), data.clip(0.2, 0.8)))
def test_errors(self):
paddle.enable_static()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
"""
High-level unit test for distributed fleet.
"""
import os
import sys
import subprocess
import six
import shutil
import numpy as np
import argparse
from contextlib import closing
import socket
import time
import tempfile
import unittest
import paddle
import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker
from paddle.distributed.fleet.base.util_factory import fleet_util
from paddle.distributed.fleet import fleet
__all__ = ['FleetDistHeterRunnerBase', 'TestFleetHeterBase', 'runtime_main']
RUN_STEP = 5
LEARNING_RATE = 0.01
DIST_UT_PORT = 0
class FleetDistHeterRunnerBase(object):
"""
    run_pserver, run_trainer : after the role is initialized, split the program with the transpiler
    net : implemented by child classes, builds the network of the model
    do training : the executor runs the program
"""
def build_role(self, args):
environs = {}
environs["PADDLE_PSERVERS_IP_PORT_LIST"] = args.endpoints
environs["PADDLE_TRAINER_ENDPOINTS"] = args.trainer_endpoints
environs[
"PADDLE_HETER_TRAINER_IP_PORT_LIST"] = args.heter_trainer_endpoints
environs["PADDLE_HETER_TRAINER_DEVICE"] = args.heter_trainer_device
environs["TRAINING_ROLE"] = args.role.upper()
environs["PADDLE_TRAINERS_NUM"] = args.trainers
environs["PADDLE_TRAINER_ID"] = args.current_id
if args.role.upper() == "PSERVER":
environs["POD_IP"] = args.endpoints.split(",")[int(
args.current_id)].split(":")[0]
environs["PADDLE_PORT"] = args.endpoints.split(",")[int(
args.current_id)].split(":")[1]
elif args.role.upper() == "HETER_TRAINER":
environs["POD_IP"] = args.heter_trainer_endpoints.split(",")[int(
args.current_id)].split(":")[0]
environs["PADDLE_PORT"] = args.heter_trainer_endpoints.split(",")[
int(args.current_id)].split(":")[1]
environs["FLAGS_selected_gpus"] = args.current_id
for k, v in environs.items():
os.environ[k] = str(v)
self.role = role_maker.PaddleCloudRoleMaker()
return self.role
def build_strategy(self, args):
self.strategy = paddle.distributed.fleet.DistributedStrategy()
self.strategy.a_sync = True
return self.strategy
def build_optimizer(self, avg_cost, strategy):
optimizer = fluid.optimizer.SGD(LEARNING_RATE)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
def run_pserver(self, args):
fleet.init_server()
fleet.run_server()
def run_dataset_trainer(self, args):
out = self.do_dataset_training(fleet)
def run_pyreader_trainer(self, args):
out = self.do_pyreader_training(fleet)
def net(self, args, batch_size=4, lr=0.01):
raise NotImplementedError(
"get_model should be implemented by child classes.")
def do_dataset_training(self, fleet):
raise NotImplementedError(
"do_dataset_training should be implemented by child classes.")
def do_pyreader_training(self, fleet):
raise NotImplementedError(
"do_pyreader_training should be implemented by child classes.")
class TestFleetHeterBase(unittest.TestCase):
"""
    start_pserver, start_trainer : build the start commands used by the test
    run_cluster : use multiple processes to test the distributed program
"""
def _setup_config(self):
raise NotImplementedError("tests should have _setup_config implemented")
def tearDown(self):
t = time.time() - self.startTime
print('%s: %.3f' % (self.__class__.__name__, t))
def setUp(self):
self.startTime = time.time()
self._mode = "async"
self._reader = "pyreader"
self._trainers = 2
self._pservers = 2
self._port_set = set()
self._heter_device = "gpu"
global DIST_UT_PORT
if DIST_UT_PORT == 0 and os.getenv("PADDLE_DIST_UT_PORT"):
DIST_UT_PORT = int(os.getenv("PADDLE_DIST_UT_PORT"))
if DIST_UT_PORT:
print("set begin_port:", DIST_UT_PORT)
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
DIST_UT_PORT, DIST_UT_PORT + 1)
self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
DIST_UT_PORT + 2, DIST_UT_PORT + 3)
self._heter_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
DIST_UT_PORT + 4, DIST_UT_PORT + 5)
DIST_UT_PORT += 6
else:
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._heter_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._python_interp = sys.executable
self._geo_sgd_need_push_nums = 5
self._grad_clip_mode = 0
self._setup_config()
def _find_free_port(self):
def __free_port():
with closing(socket.socket(socket.AF_INET,
socket.SOCK_STREAM)) as s:
s.bind(('', 0))
return s.getsockname()[1]
while True:
port = __free_port()
if port not in self._port_set:
self._port_set.add(port)
return port
def _start_pserver(self, cmd, required_envs):
ps0_cmd, ps1_cmd = cmd.format(0), cmd.format(1)
ps0_pipe = open(tempfile.gettempdir() + "/ps0_err.log", "wb+")
ps1_pipe = open(tempfile.gettempdir() + "/ps1_err.log", "wb+")
ps0_proc = subprocess.Popen(
ps0_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=ps0_pipe,
env=required_envs)
ps1_proc = subprocess.Popen(
ps1_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=ps1_pipe,
env=required_envs)
return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
def _start_trainer(self, cmd, required_envs):
tr0_cmd, tr1_cmd = cmd.format(0), cmd.format(1)
tr0_pipe = open(tempfile.gettempdir() + "/tr0_err.log", "wb+")
tr1_pipe = open(tempfile.gettempdir() + "/tr1_err.log", "wb+")
tr0_out = open(tempfile.gettempdir() + "/tr0_out.log", "wb+")
tr1_out = open(tempfile.gettempdir() + "/tr1_out.log", "wb+")
tr0_proc = subprocess.Popen(
tr0_cmd.strip().split(" "),
stdout=tr0_out,
stderr=tr0_pipe,
env=required_envs)
tr1_proc = subprocess.Popen(
tr1_cmd.strip().split(" "),
stdout=tr1_out,
stderr=tr1_pipe,
env=required_envs)
return tr0_proc, tr1_proc, tr0_pipe, tr1_pipe
def _start_heter_trainer(self, cmd, required_envs):
heter0_cmd, heter1_cmd = cmd.format(0), cmd.format(1)
heter0_pipe = open(tempfile.gettempdir() + "/heter0_err.log", "wb+")
heter1_pipe = open(tempfile.gettempdir() + "/heter1_err.log", "wb+")
heter0_out = open(tempfile.gettempdir() + "/heter0_out.log", "wb+")
heter1_out = open(tempfile.gettempdir() + "/heter1_out.log", "wb+")
heter0_proc = subprocess.Popen(
heter0_cmd.strip().split(" "),
stdout=heter0_out,
stderr=heter0_pipe,
env=required_envs)
heter1_proc = subprocess.Popen(
heter1_cmd.strip().split(" "),
stdout=heter1_out,
stderr=heter1_pipe,
env=required_envs)
return heter0_proc, heter1_proc, heter0_pipe, heter1_pipe
def _run_cluster(self, model, envs):
env = {'GRAD_CLIP': str(self._grad_clip_mode)}
python_path = self._python_interp
gloo_path = tempfile.mkdtemp()
if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
python_path += " -m coverage run --branch -p"
env.update(envs)
tr_cmd = "{0} {1} --role trainer --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8} --heter_trainer_endpoints {9} --heter_trainer_device {10}".format(
python_path, model, self._ps_endpoints, self._tr_endpoints,
self._trainers, self._mode, self._geo_sgd_need_push_nums,
self._reader, gloo_path, self._heter_endpoints, self._heter_device)
ps_cmd = "{0} {1} --role pserver --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8} --heter_trainer_endpoints {9} --heter_trainer_device {10}".format(
python_path, model, self._ps_endpoints, self._tr_endpoints,
self._trainers, self._mode, self._geo_sgd_need_push_nums,
self._reader, gloo_path, self._heter_endpoints, self._heter_device)
heter_cmd = "{0} {1} --role heter_trainer --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8} --heter_trainer_endpoints {9} --heter_trainer_device {10}".format(
python_path, model, self._ps_endpoints, self._tr_endpoints,
self._trainers, self._mode, self._geo_sgd_need_push_nums,
self._reader, gloo_path, self._heter_endpoints, self._heter_device)
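        # The escaped {{}} in the templates survives the outer .format() as a
        # plain {} placeholder for the worker id; _start_pserver/_start_trainer/
        # _start_heter_trainer later fill it via cmd.format(0) / cmd.format(1).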
# Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env)
tr0, tr1, tr0_pipe, tr1_pipe = self._start_trainer(tr_cmd, env)
heter0, heter1, heter0_pipe, heter1_pipe = self._start_heter_trainer(
heter_cmd, env)
        # Wait until the trainer processes terminate
while True:
stat0 = tr0.poll()
time.sleep(0.1)
if stat0 is not None:
break
while True:
stat1 = tr1.poll()
time.sleep(0.1)
if stat1 is not None:
break
tr0_out, tr0_err = tr0.communicate()
tr1_out, tr1_err = tr1.communicate()
print("tr end communicate")
tr0_ret = tr0.returncode
        tr1_ret = tr1.returncode
print("tr get returncode: {}".format(tr0_ret))
if tr0_ret != 0:
print(
"========================Error tr0_err begin==========================="
)
os.system("cat {}".format(tempfile.gettempdir() + "/tr0_err.log"))
print(
"========================Error tr0_err end==========================="
)
if tr1_ret != 0:
print(
"========================Error tr1_err begin==========================="
)
os.system("cat {}".format(tempfile.gettempdir() + "/tr1_err.log"))
print(
"========================Error tr1_err end==========================="
)
self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
# close trainer file
tr0_pipe.close()
tr1_pipe.close()
ps0_pipe.close()
ps1_pipe.close()
heter0_pipe.close()
heter1_pipe.close()
ps0.terminate()
ps1.terminate()
heter0.terminate()
heter1.terminate()
shutil.rmtree(gloo_path)
return 0, 0
def check_with_place(self,
model_file,
delta=1e-3,
check_error_log=False,
need_envs={}):
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_rpc_deadline": "5000", # 5sec to fail fast
"http_proxy": ""
}
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "3"
required_envs["GLOG_logtostderr"] = "1"
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def runtime_main(test_class):
parser = argparse.ArgumentParser(description='Run Fleet test.')
parser.add_argument(
'--role',
type=str,
required=True,
choices=['pserver', 'trainer', 'heter_trainer'])
parser.add_argument('--endpoints', type=str, required=False, default="")
parser.add_argument(
'--trainer_endpoints', type=str, required=False, default="")
parser.add_argument(
'--heter_trainer_endpoints', type=str, required=False, default="")
parser.add_argument(
'--heter_trainer_device', type=str, required=False, default="gpu")
parser.add_argument('--gloo_path', type=str, required=False, default="")
parser.add_argument('--current_id', type=int, required=False, default=0)
parser.add_argument('--trainers', type=int, required=False, default=1)
parser.add_argument('--mode', type=str, required=False, default='async')
parser.add_argument(
'--geo_sgd_need_push_nums', type=int, required=False, default=2)
parser.add_argument('--reader', type=str, required=False, default='dataset')
args = parser.parse_args()
model = test_class()
role = model.build_role(args)
fleet.init(role)
strategy = model.build_strategy(args)
avg_cost = model.net(args)
model.build_optimizer(avg_cost, strategy)
fleet_util._set_strategy(strategy)
fleet_util._set_role_maker(role)
if args.role == "pserver" or args.role == "heter_trainer":
model.run_pserver(args)
else:
if args.reader == "dataset":
model.run_dataset_trainer(args)
else:
model.run_pyreader_trainer(args)
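# Illustrative launch command (values are made up, mirroring the templates
# built in TestFleetHeterBase._run_cluster):
#   python dist_fleet_heter_ctr.py --role trainer \
#       --endpoints 127.0.0.1:36012,127.0.0.1:36013 \
#       --trainer_endpoints 127.0.0.1:36014,127.0.0.1:36015 \
#       --heter_trainer_endpoints 127.0.0.1:36016,127.0.0.1:36017 \
#       --current_id 0 --trainers 2 --mode async --reader dataset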
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
import tempfile
from test_dist_fleet_heter_base import TestFleetHeterBase
class TestDistHeterDatasetAsync2x2(TestFleetHeterBase):
def _setup_config(self):
self._mode = "async"
self._reader = "dataset"
def check_with_place(self,
model_file,
delta=1e-3,
check_error_log=False,
need_envs={}):
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_rpc_deadline": "5000", # 5sec to fail fast
"http_proxy": "",
"CPU_NUM": "1"
}
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "4"
required_envs["GLOG_logtostderr"] = "1"
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def test_dist_train(self):
self.check_with_place(
"dist_fleet_heter_ctr.py", delta=1e-5, check_error_log=True)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle
import os
import math
import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker
from paddle.distributed.fleet.base.util_factory import fleet_util
from paddle.distributed.fleet import fleet
class TestDistFleetHeterProgram(unittest.TestCase):
def build_role(self):
environs = {}
environs[
"PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36012,127.0.0.1:36013"
environs["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36014,127.0.0.1:36015"
environs[
"PADDLE_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:36016,127.0.0.1:36017"
environs["PADDLE_HETER_TRAINER_DEVICE"] = "gpu"
environs["TRAINING_ROLE"] = "HETER_TRAINER"
environs["PADDLE_TRAINERS_NUM"] = 2
environs["PADDLE_TRAINER_ID"] = 0
environs["POD_IP"] = "127.0.0.1"
environs["PADDLE_PORT"] = "36016"
environs["FLAGS_selected_gpus"] = 0
for k, v in environs.items():
os.environ[k] = str(v)
self.role = role_maker.PaddleCloudRoleMaker()
return self.role
def build_strategy(self):
self.strategy = paddle.distributed.fleet.DistributedStrategy()
self.strategy.a_sync = True
return self.strategy
def build_input(self):
dense_input = fluid.layers.data(
name="dense_input", shape=[10], dtype="float32")
sparse_input_ids = [
fluid.layers.data(
name="C" + str(i), shape=[1], lod_level=1, dtype="int64")
for i in range(1, 27)
]
label = fluid.layers.data(name="label", shape=[1], dtype="float32")
inputs = [dense_input] + sparse_input_ids + [label]
return inputs
def build_net(self, inputs):
def embedding_layer(input):
return fluid.layers.embedding(
input=input,
is_sparse=True,
size=[100001, 10],
param_attr=fluid.ParamAttr(
name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()), )
sparse_embed_seq = list(map(embedding_layer, inputs[1:-1]))
concated = fluid.layers.concat(sparse_embed_seq + inputs[0:1], axis=1)
with fluid.device_guard("gpu"):
fc1 = fluid.layers.fc(
input=concated,
size=400,
act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(concated.shape[1]))),
name="fc1")
with fluid.device_guard("cpu"):
fc2 = fluid.layers.fc(input=fc1,
size=400,
act="relu",
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc1.shape[1]))),
name="fc2")
with fluid.device_guard("gpu"):
fc3 = fluid.layers.fc(input=fc2,
size=400,
act="relu",
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc2.shape[1]))),
name="fc3")
with fluid.device_guard("cpu"):
predict = fluid.layers.fc(
input=fc3,
size=2,
act="softmax",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc3.shape[1]))), )
with fluid.device_guard("gpu"):
labels = fluid.layers.cast(inputs[-1], dtype="int64")
cost = fluid.layers.cross_entropy(input=predict, label=labels)
avg_cost = fluid.layers.reduce_sum(cost)
return avg_cost
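    # The alternating fluid.device_guard blocks above make this a heterogeneous
    # program: fc1, fc3 and the cast/cross_entropy/reduce_sum section are pinned
    # to "gpu", while fc2 and the softmax fc stay on "cpu".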
def build_optimizer(self, avg_cost, strategy):
optimizer = fluid.optimizer.SGD(1e-2)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
def test(self):
role = self.build_role()
fleet.init(role)
strategy = self.build_strategy()
inputs = self.build_input()
avg_cost = self.build_net(inputs)
self.build_optimizer(avg_cost, strategy)
if __name__ == "__main__":
unittest.main()
......@@ -102,8 +102,23 @@ class TestExpandAsOpRank4(OpTest):
self.check_grad(['X'], 'Out')
# Test dygraph API
class TestExpandAsDygraphAPI(unittest.TestCase):
def test_api(self):
import paddle
paddle.disable_static()
np_data_x = np.array([1, 2, 3]).astype('int32')
np_data_y = np.array([1, 2, 3, 1, 2, 3]).astype('int32')
data_x = paddle.to_tensor(np_data_x)
data_y = paddle.to_tensor(np_data_y)
out = fluid.layers.expand_as(data_x, data_y)
np_out = out.numpy()
assert np.array_equal(np_out, np.tile(np_data_x, (2)))
paddle.enable_static()
# Test python API
class TestExpandAPI(unittest.TestCase):
class TestExpandAsAPI(unittest.TestCase):
def test_api(self):
input1 = np.random.random([12, 14]).astype("float32")
input2 = np.random.random([48, 14]).astype("float32")
......
......@@ -43,7 +43,7 @@ class TestFleetBase(unittest.TestCase):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
strategy = fleet.DistributedStrategy()
optimizer = paddle.optimizer.SGD(learning_rate=0.001)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -22,6 +22,7 @@ import paddle.fluid.core as core
from paddle.fluid.op import Operator
from paddle.fluid.executor import Executor
from op_test import OpTest
import paddle
class TestGaussianRandomOp(OpTest):
......@@ -235,6 +236,56 @@ class TestGaussianRandomAPI(unittest.TestCase):
self.assertAlmostEqual(np.mean(res_6), 0.0, delta=0.1)
self.assertAlmostEqual(np.std(res_6), 1., delta=0.1)
def test_default_dtype(self):
paddle.disable_static()
def test_default_fp_16():
paddle.framework.set_default_dtype('float16')
paddle.tensor.random.gaussian_random([2, 3])
self.assertRaises(TypeError, test_default_fp_16)
def test_default_fp_32():
paddle.framework.set_default_dtype('float32')
out = paddle.tensor.random.gaussian_random([2, 3])
self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32)
def test_default_fp_64():
paddle.framework.set_default_dtype('float64')
out = paddle.tensor.random.gaussian_random([2, 3])
self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64)
test_default_fp_64()
test_default_fp_32()
paddle.enable_static()
class TestStandardNormalDtype(unittest.TestCase):
def test_default_dtype(self):
paddle.disable_static()
def test_default_fp_16():
paddle.framework.set_default_dtype('float16')
paddle.tensor.random.standard_normal([2, 3])
self.assertRaises(TypeError, test_default_fp_16)
def test_default_fp_32():
paddle.framework.set_default_dtype('float32')
out = paddle.tensor.random.standard_normal([2, 3])
self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32)
def test_default_fp_64():
paddle.framework.set_default_dtype('float64')
out = paddle.tensor.random.standard_normal([2, 3])
self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64)
test_default_fp_64()
test_default_fp_32()
paddle.enable_static()
if __name__ == "__main__":
unittest.main()
......@@ -658,7 +658,7 @@ class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase):
class TestImperativePipelineOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(learning_rate=0.5,
parameter_list=parameter_list)
parameters=parameter_list)
optimizer = PipelineOptimizer(optimizer)
return optimizer
......@@ -670,7 +670,7 @@ class TestImperativePipelineOptimizer(TestImperativeOptimizerBase):
class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(learning_rate=0.5,
parameter_list=parameter_list)
parameters=parameter_list)
optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5)
return optimizer
......@@ -682,7 +682,7 @@ class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase):
class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(learning_rate=0.5,
parameter_list=parameter_list)
parameters=parameter_list)
optimizer = RecomputeOptimizer(optimizer)
return optimizer
......
......@@ -299,7 +299,7 @@ class TestLayer(LayerTest):
my_syncbn = paddle.nn.SyncBatchNorm(3)
dy_ret = my_syncbn(base.to_variable(t))
dy_ret_value = dy_ret.numpy()
self.assertTrue(np.array_equal(static_ret, static_ret))
self.assertTrue(np.array_equal(static_ret, dy_ret_value))
def test_relu(self):
with self.static_graph():
......
......@@ -19,6 +19,8 @@ import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from op_test import OpTest
import paddle
import paddle.fluid as fluid
class TestMomentumOp1(OpTest):
......@@ -234,5 +236,48 @@ class TestSparseMomentumOp2(TestSparseMomentumOp):
self.use_nesterov = True
class TestMomentumV2(unittest.TestCase):
def test_momentum_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
        momentum = paddle.optimizer.Momentum(
            learning_rate=0.01, momentum=0.9, parameters=linear.parameters())
        out = linear(a)
        out.backward()
        momentum.step()
        momentum.clear_gradients()
def test_momentum(self):
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
            momentum_optimizer = paddle.optimizer.Momentum(
                learning_rate=0.1, momentum=0.9)
            momentum_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for data in train_reader():
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
def test_raise_error(self):
self.assertRaises(
ValueError, paddle.optimizer.Momentum, learning_rate=None)
self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None)
if __name__ == "__main__":
unittest.main()
......@@ -174,66 +174,6 @@ class TestPool1d_API(unittest.TestCase):
result = max_pool1d_dg(input)
self.assertTrue(np.allclose(result.numpy(), result_np))
def check_adaptive_max_dygraph_results(self, place):
with fluid.dygraph.guard(place):
input_np = np.random.random([2, 3, 32]).astype("float32")
input = fluid.dygraph.to_variable(input_np)
result = F.adaptive_max_pool1d(input, output_size=16)
result_np = max_pool1D_forward_naive(
input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
self.assertTrue(np.allclose(result.numpy(), result_np))
ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1d(
output_size=16)
result = ada_max_pool1d_dg(input)
self.assertTrue(np.allclose(result.numpy(), result_np))
def check_adaptive_avg_dygraph_results(self, place):
with fluid.dygraph.guard(place):
input_np = np.random.random([2, 3, 32]).astype("float32")
input = fluid.dygraph.to_variable(input_np)
result = F.adaptive_avg_pool1d(input, output_size=16)
result_np = avg_pool1D_forward_naive(
input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
self.assertTrue(np.allclose(result.numpy(), result_np))
ada_max_pool1d_dg = paddle.nn.layer.AdaptiveAvgPool1d(
output_size=16)
result = ada_max_pool1d_dg(input)
self.assertTrue(np.allclose(result.numpy(), result_np))
def check_adaptive_max_static_results(self, place):
with fluid.program_guard(fluid.Program(), fluid.Program()):
input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
result = F.adaptive_max_pool1d(input, output_size=16)
input_np = np.random.random([2, 3, 32]).astype("float32")
result_np = max_pool1D_forward_naive(
input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
exe = fluid.Executor(place)
fetches = exe.run(fluid.default_main_program(),
feed={"input": input_np},
fetch_list=[result])
self.assertTrue(np.allclose(fetches[0], result_np))
def check_adaptive_avg_static_results(self, place):
with fluid.program_guard(fluid.Program(), fluid.Program()):
input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
result = F.adaptive_avg_pool1d(input, output_size=16)
input_np = np.random.random([2, 3, 32]).astype("float32")
result_np = avg_pool1D_forward_naive(
input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
exe = fluid.Executor(place)
fetches = exe.run(fluid.default_main_program(),
feed={"input": input_np},
fetch_list=[result])
self.assertTrue(np.allclose(fetches[0], result_np))
def check_max_dygraph_padding_same(self, place):
with fluid.dygraph.guard(place):
input_np = np.random.random([2, 3, 32]).astype("float32")
......@@ -265,10 +205,6 @@ class TestPool1d_API(unittest.TestCase):
self.check_avg_dygraph_results(place)
self.check_max_static_results(place)
self.check_avg_static_results(place)
self.check_adaptive_max_dygraph_results(place)
self.check_adaptive_avg_dygraph_results(place)
self.check_adaptive_max_static_results(place)
self.check_adaptive_avg_static_results(place)
self.check_max_dygraph_padding_same(place)
self.check_avg_dygraph_padding_same(place)
......
......@@ -21,6 +21,7 @@ import paddle.fluid.core as core
from paddle import rand
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
import paddle
class TestRandOpError(unittest.TestCase):
......@@ -115,5 +116,31 @@ class TestRandOpForDygraph(unittest.TestCase):
self.run_net(True)
class TestRandDtype(unittest.TestCase):
def test_default_dtype(self):
paddle.disable_static()
def test_default_fp_16():
paddle.framework.set_default_dtype('float16')
paddle.tensor.random.rand([2, 3])
self.assertRaises(TypeError, test_default_fp_16)
def test_default_fp_32():
paddle.framework.set_default_dtype('float32')
out = paddle.tensor.random.rand([2, 3])
self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32)
def test_default_fp_64():
paddle.framework.set_default_dtype('float64')
out = paddle.tensor.random.rand([2, 3])
self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64)
test_default_fp_64()
test_default_fp_32()
paddle.enable_static()
if __name__ == "__main__":
unittest.main()
......@@ -20,6 +20,7 @@ import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from op_test import OpTest
import paddle
class TestSGDOp(OpTest):
......@@ -208,5 +209,46 @@ class TestSGDOpWithLargeInput(unittest.TestCase):
result = exe.run(compiled_prog, fetch_list=[avg_cost])
class TestSGDV2(unittest.TestCase):
def test_sgd_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
        sgd = paddle.optimizer.SGD(
            learning_rate=0.01,
            parameters=linear.parameters(),
            weight_decay=0.01)
        out = linear(a)
        out.backward()
        sgd.step()
        sgd.clear_gradients()
def test_sgd(self):
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
            sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.1)
            sgd_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for data in train_reader():
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
def test_raise_error(self):
self.assertRaises(ValueError, paddle.optimizer.SGD, learning_rate=None)
if __name__ == "__main__":
unittest.main()
......@@ -221,5 +221,21 @@ class TestDygraphSyncBatchNormAPIError(unittest.TestCase):
self.assertRaises(TypeError, my_sync_batch_norm, x2)
class TestConvertSyncBatchNorm(unittest.TestCase):
def test_convert(self):
if not core.is_compiled_with_cuda():
return
with program_guard(Program(), Program()):
model = paddle.nn.Sequential(
paddle.nn.Conv2d(3, 5, 3), paddle.nn.BatchNorm2d(5))
sync_model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)
for idx, sublayer in enumerate(model.sublayers()):
if isinstance(sublayer, paddle.nn.BatchNorm2d):
self.assertEqual(
isinstance(sync_model[idx], paddle.nn.SyncBatchNorm),
True)
if __name__ == '__main__':
unittest.main()
......@@ -536,5 +536,31 @@ class TestUniformDygraphMode(unittest.TestCase):
self.assertTrue((x_np[i] > 0 and x_np[i] < 1.0))
class TestUniformDtype(unittest.TestCase):
def test_default_dtype(self):
paddle.disable_static()
def test_default_fp_16():
paddle.framework.set_default_dtype('float16')
paddle.tensor.random.uniform([2, 3])
self.assertRaises(TypeError, test_default_fp_16)
def test_default_fp_32():
paddle.framework.set_default_dtype('float32')
out = paddle.tensor.random.uniform([2, 3])
self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32)
def test_default_fp_64():
paddle.framework.set_default_dtype('float64')
out = paddle.tensor.random.uniform([2, 3])
self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64)
test_default_fp_64()
test_default_fp_32()
paddle.enable_static()
if __name__ == "__main__":
unittest.main()
......@@ -891,10 +891,11 @@ class Model(object):
class Mnist(paddle.nn.Layer):
def __init__(self):
super(MyNet, self).__init__()
self._fc = Linear(784, 1, act='softmax')
super(Mnist, self).__init__()
self._fc = Linear(784, 10, act='softmax')
@paddle.jit.to_static # If save for inference in dygraph, need this
# If save for inference in dygraph, need this
@paddle.jit.to_static
def forward(self, x):
y = self._fc(x)
return y
......@@ -903,21 +904,18 @@ class Model(object):
device = hapi.set_device('cpu')
# if use static graph, do not set
paddle.disable_static(device) if dynamic else None
# inputs and labels are not required for dynamic graph.
input = hapi.Input([None, 784], 'float32', 'x')
label = hapi.Input([None, 1], 'int64', 'label')
model = hapi.Model(Mnist(), input, label)
optim = paddle.optimizer.SGD(learning_rate=1e-3,
parameter_list=model.parameters())
model.prepare(optim,
paddle.nn.CrossEntropyLoss(),
hapi.metrics.Accuracy())
model.prepare(optim, paddle.nn.CrossEntropyLoss())
mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
model.fit(mnist_data, epochs=1, batch_size=32, verbose=0)
model.save('checkpoint/test') # save for training
model.save('inference_model', False) # save for inference
"""
if ParallelEnv().local_rank == 0:
......@@ -1534,47 +1532,6 @@ class Model(object):
Returns:
list: The fetch variables' name list
Examples:
.. code-block:: python
import numpy as np
import paddle
from paddle.static import InputSpec
import paddle.incubate.hapi as hapi
from paddle.nn import Linear
from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset
class Mnist(Layer):
def __init__(self, classifier_act=None):
super(Mnist, self).__init__()
self.fc = Linear(input_dim=784, output_dim=10, act="softmax")
@paddle.jit.to_static # In static mode, you need to delete this.
def forward(self, inputs):
outputs = self.fc(inputs)
return outputs
dynamic = True # False
device = hapi.set_device('gpu')
# if use static graph, do not set
paddle.disable_static(device) if dynamic else None
# inputs and labels are not required for dynamic graph.
input = InputSpec([None, 784], 'float32', 'x')
label = InputSpec([None, 1], 'int64', 'label')
model = hapi.Model(Mnist(), input, label)
optim = paddle.optimizer.SGD(learning_rate=1e-3,
parameter_list=model.parameters())
model.prepare(optim,
paddle.nn.CrossEntropyLoss(),
hapi.metrics.Accuracy())
mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
model.fit(mnist_data, epochs=1, batch_size=32, verbose=0)
model.save_inference_model('inference_model')
"""
def get_inout_spec(all_vars, return_name=False):
......@@ -1592,8 +1549,8 @@ class Model(object):
# the inputs of the model in running.
# 3. Make it Unnecessary to add `@paddle.jit.to_static` for users in dynamic mode.
if fluid.in_dygraph_mode():
with fluid.framework._dygraph_guard(None):
layer = self.network
fluid.disable_dygraph()
# 1. input check
prog_translator = ProgramTranslator()
......@@ -1631,7 +1588,8 @@ class Model(object):
if param_or_buffer.name in state_names_dict:
extra_info_dict['structured_name'] = state_names_dict[
param_or_buffer.name]
extra_info_dict['stop_gradient'] = param_or_buffer.stop_gradient
extra_info_dict[
'stop_gradient'] = param_or_buffer.stop_gradient
if isinstance(param_or_buffer, ParamBase):
extra_info_dict['trainable'] = param_or_buffer.trainable
extra_var_info[param_or_buffer.name] = extra_info_dict
......
......@@ -64,6 +64,11 @@ class TestTransforms(unittest.TestCase):
self.do_transform(trans)
def test_normalize(self):
normalize = transforms.Normalize(mean=0.5, std=0.5)
trans = transforms.Compose([transforms.Permute(mode='CHW'), normalize])
self.do_transform(trans)
def test_trans_resize(self):
trans = transforms.Compose([
transforms.Resize(300, [0, 1]),
......@@ -165,7 +170,7 @@ class TestTransforms(unittest.TestCase):
fake_img = np.random.rand(500, 400, 3).astype('float32')
fake_img_gray = trans_gray(fake_img)
np.testing.assert_equal(len(fake_img_gray.shape), 2)
np.testing.assert_equal(len(fake_img_gray.shape), 3)
np.testing.assert_equal(fake_img_gray.shape[0], 500)
np.testing.assert_equal(fake_img_gray.shape[1], 400)
......
......@@ -16,6 +16,7 @@ import sys
import collections
import random
import math
import functools
import cv2
import numbers
......@@ -31,6 +32,23 @@ else:
__all__ = ['flip', 'resize', 'pad', 'rotate', 'to_grayscale']
def keepdims(func):
"""Keep the dimension of input images unchanged"""
@functools.wraps(func)
def wrapper(image, *args, **kwargs):
if len(image.shape) != 3:
raise ValueError("Expect image have 3 dims, but got {} dims".format(
len(image.shape)))
ret = func(image, *args, **kwargs)
if len(ret.shape) == 2:
ret = ret[:, :, np.newaxis]
return ret
return wrapper
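# [Illustrative sketch, not part of the patch] How the keepdims decorator above
# behaves: a hypothetical transform that collapses an image to 2-D gets its
# channel axis restored, so downstream transforms can keep assuming HWC input.
import numpy as np  # already imported at the top of this module


@keepdims
def _fake_to_gray(image):
    # hypothetical helper that drops the channel axis
    return image.mean(axis=2)


_img = np.random.rand(4, 4, 3).astype('float32')
assert _fake_to_gray(_img).shape == (4, 4, 1)  # trailing axis re-added by keepdims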
@keepdims
def flip(image, code):
"""
    According to the code (the type of flip), flip the input image
......@@ -62,6 +80,7 @@ def flip(image, code):
return cv2.flip(image, flipCode=code)
@keepdims
def resize(img, size, interpolation=cv2.INTER_LINEAR):
"""
resize the input data to given size
......@@ -103,6 +122,7 @@ def resize(img, size, interpolation=cv2.INTER_LINEAR):
return cv2.resize(img, size[::-1], interpolation=interpolation)
@keepdims
def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'):
"""Pads the given CV Image on all sides with speficified padding mode and fill value.
......@@ -193,6 +213,7 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'):
return img
@keepdims
def rotate(img,
angle,
interpolation=cv2.INTER_LINEAR,
......@@ -266,6 +287,7 @@ def rotate(img,
return dst.astype(dtype)
@keepdims
def to_grayscale(img, num_output_channels=1):
"""Converts image to grayscale version of image.
......
......@@ -505,7 +505,7 @@ class Normalize(object):
mean = [mean, mean, mean]
if isinstance(std, numbers.Number):
mean = [std, std, std]
std = [std, std, std]
self.mean = np.array(mean, dtype=np.float32).reshape(len(mean), 1, 1)
self.std = np.array(std, dtype=np.float32).reshape(len(std), 1, 1)
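# [Illustrative sketch, not part of the patch] Effect of the fix above: a scalar
# mean/std is now broadcast to all three channels instead of the scalar std
# silently overwriting mean. Roughly what Normalize does with these buffers:
import numpy as np

_mean = np.array([0.5, 0.5, 0.5], dtype=np.float32).reshape(3, 1, 1)
_std = np.array([0.5, 0.5, 0.5], dtype=np.float32).reshape(3, 1, 1)
_chw_img = np.random.rand(3, 8, 8).astype(np.float32)
_normalized = (_chw_img - _mean) / _std  # channel-wise (x - mean) / std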
......
......@@ -97,8 +97,20 @@ from .layer.common import Dropout #DEFINE_ALIAS
from .layer.common import Dropout2D #DEFINE_ALIAS
from .layer.common import Dropout3D #DEFINE_ALIAS
from .layer.common import AlphaDropout #DEFINE_ALIAS
from .layer.pooling import AvgPool1d #DEFINE_ALIAS
from .layer.pooling import AvgPool2d #DEFINE_ALIAS
from .layer.pooling import AvgPool3d #DEFINE_ALIAS
from .layer.pooling import MaxPool1d #DEFINE_ALIAS
from .layer.pooling import MaxPool2d #DEFINE_ALIAS
from .layer.pooling import MaxPool3d #DEFINE_ALIAS
from .layer.pooling import AdaptiveAvgPool1d #DEFINE_ALIAS
from .layer.pooling import AdaptiveAvgPool2d #DEFINE_ALIAS
from .layer.pooling import AdaptiveAvgPool3d #DEFINE_ALIAS
from .layer.pooling import AdaptiveMaxPool1d #DEFINE_ALIAS
from .layer.pooling import AdaptiveMaxPool2d #DEFINE_ALIAS
from .layer.pooling import AdaptiveMaxPool3d #DEFINE_ALIAS
from .layer.conv import Conv1d #DEFINE_ALIAS
from .layer.conv import Conv2d #DEFINE_ALIAS
from .layer.conv import Conv3d #DEFINE_ALIAS
......
......@@ -170,22 +170,28 @@ from .norm import layer_norm #DEFINE_ALIAS
from .norm import lrn #DEFINE_ALIAS
from .norm import normalize #DEFINE_ALIAS
# from .norm import spectral_norm #DEFINE_ALIAS
from .pooling import max_pool1d #DEFINE_ALIAS
from .pooling import avg_pool1d #DEFINE_ALIAS
from .pooling import adaptive_max_pool1d #DEFINE_ALIAS
from .pooling import adaptive_avg_pool1d #DEFINE_ALIAS
from .pooling import pool2d #DEFINE_ALIAS
from .pooling import pool3d #DEFINE_ALIAS
from .pooling import avg_pool1d #DEFINE_ALIAS
from .pooling import adaptive_pool2d #DEFINE_ALIAS
from .pooling import adaptive_pool3d #DEFINE_ALIAS
from .rnn import rnn #DEFINE_ALIAS
from .rnn import birnn #DEFINE_ALIAS
from .pooling import avg_pool2d #DEFINE_ALIAS
from .pooling import max_pool2d #DEFINE_ALIAS
from .pooling import avg_pool3d #DEFINE_ALIAS
from .pooling import max_pool1d #DEFINE_ALIAS
from .pooling import max_pool2d #DEFINE_ALIAS
from .pooling import max_pool3d #DEFINE_ALIAS
from .pooling import adaptive_pool2d #DEFINE_ALIAS
from .pooling import adaptive_pool3d #DEFINE_ALIAS
from .pooling import adaptive_max_pool1d #DEFINE_ALIAS
from .pooling import adaptive_max_pool2d #DEFINE_ALIAS
from .pooling import adaptive_max_pool3d #DEFINE_ALIAS
from .pooling import adaptive_avg_pool1d #DEFINE_ALIAS
from .pooling import adaptive_avg_pool2d #DEFINE_ALIAS
from .pooling import adaptive_avg_pool3d #DEFINE_ALIAS
from .rnn import rnn #DEFINE_ALIAS
from .rnn import birnn #DEFINE_ALIAS
# from .rnn import gru_unit #DEFINE_ALIAS
# from .rnn import lstm #DEFINE_ALIAS
# from .rnn import lstm_unit #DEFINE_ALIAS
......
......@@ -158,7 +158,7 @@ def conv1d(x,
bias (Tensor, optional): The bias with shape [M,]. Default: None.
stride (int or tuple, optional): The stride size. If stride is a tuple, it must
            contain one integer, (stride_size). Default: 1.
padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms.
1. a string in ['valid', 'same'].
            2. an int, which means the feature map is zero padded by size of `padding` on both sides.
            3. a list[int] or tuple[int] whose length is 1, which means the feature map is zero padded by size of `padding[0]` on both sides.
......@@ -185,7 +185,7 @@ def conv1d(x,
same with input.
Raises:
ValueError: If the channel dimmention of the input is less than or equal to zero.
ValueError: If the channel dimension of the input is less than or equal to zero.
ValueError: If `data_format` is not "NCL" or "NLC".
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
......@@ -238,7 +238,7 @@ def conv1d(x,
num_channels = x.shape[channel_dim]
num_filters = weight.shape[0]
if num_channels < 0:
raise ValueError("The channel dimmention of the input({}) "
raise ValueError("The channel dimension of the input({}) "
"should be defined. Received: {}.".format(
x.shape, num_channels))
if num_channels % groups != 0:
......@@ -260,7 +260,7 @@ def conv1d(x,
padding = padding + [0]
else:
raise ValueError(
"The size of padding's dimmention should 1 or 2. But got padding={}".
"The size of padding's dimension should be 1 or 2. But got padding={}".
format(padding))
stride = utils.convert_to_list(stride, 1, 'stride') + [1]
......@@ -424,7 +424,7 @@ def conv2d(x,
Raises:
ValueError: If `data_format` is not "NCHW" or "NHWC".
ValueError: If the channel dimmention of the input is less than or equal to zero.
ValueError: If the channel dimension of the input is less than or equal to zero.
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
or the element corresponding to the input's channel is not 0.
......@@ -465,7 +465,7 @@ def conv2d(x,
num_channels = x.shape[channel_dim]
num_filters = weight.shape[0]
if num_channels < 0:
raise ValueError("The channel dimmention of the input({}) "
raise ValueError("The channel dimension of the input({}) "
"should be defined. Received: {}.".format(
x.shape, num_channels))
if num_channels % groups != 0:
......@@ -710,7 +710,7 @@ def conv_transpose1d(x,
num_channels = x.shape[channel_dim]
if num_channels < 0:
raise ValueError("The channel dimmention of the input({}) "
raise ValueError("The channel dimension of the input({}) "
"should be defined. Received: {}.".format(
x.shape, num_channels))
if num_channels % groups != 0:
......@@ -728,7 +728,7 @@ def conv_transpose1d(x,
padding = padding + [0]
else:
raise ValueError(
"The size of padding's dimmention should 1 or 2. But got padding={}".
"The size of padding's dimension should 1 or 2. But got padding={}".
format(padding))
stride = utils.convert_to_list(stride, 1, 'stride') + [1]
......@@ -807,10 +807,10 @@ def conv_transpose2d(x,
stride=1,
padding=0,
output_padding=0,
groups=1,
dilation=1,
data_format='NCHW',
groups=1,
output_size=None,
data_format='NCHW',
name=None):
"""
......@@ -883,28 +883,27 @@ def conv_transpose2d(x,
stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution.
If stride is a tuple, it must contain two integers, (stride_height, stride_width).
Otherwise, stride_height = stride_width = stride. Default: stride = 1.
padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
`dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
string, either 'VALID' or 'SAME' supported, which is the padding algorithm.
If `padding` is a tuple or list, it could be in three forms:
`[pad_height, pad_width]` or
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and
when `data_format` is `'NCHW'`,
`padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `'NHWC'`, `padding` can be in the form
padding(str|int|list|tuple, optional): The padding size. It means the number of zero-paddings
on both sides for each dimension. If `padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If padding size is a tuple or list,
it could be in three forms: `[pad_height, pad_width]` or
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
and when `data_format` is `"NCHW"`, `pool_padding` can be in the form
`[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Default: padding = 0.
output_padding(int|list|tuple, optional): Additional size added to one side
of each dimension in the output shape. Default: 0.
dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points.
If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width).
Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1.
groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
when group=2, the first half of the filters is only connected to the
first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels.
Default: groups = 1.
dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points.
If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width).
Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1.
output_size(int|tuple|list, optional): The output image size. If output size is a
tuple, it must contain two integers, (image_height, image_width). None if use
filter_size, padding, and stride to calculate output_size.
......@@ -950,7 +949,7 @@ def conv_transpose2d(x,
paddle.disable_static()
x_var = paddle.to_tensor(x)
w_var = paddle.to_tensor(w)
y_var = F.conv2d_transpose(x_var, w_var)
y_var = F.conv_transpose2d(x_var, w_var)
y_np = y_var.numpy()
print(y_np.shape)
......@@ -966,7 +965,7 @@ def conv_transpose2d(x,
channel_dim = -1 if channel_last else 1
num_channels = x.shape[channel_dim]
if num_channels < 0:
raise ValueError("The channel dimmention of the input({}) "
raise ValueError("The channel dimension of the input({}) "
"should be defined. Received: {}.".format(
x.shape, num_channels))
if num_channels % groups != 0:
......@@ -1147,7 +1146,7 @@ def conv3d(x,
Raises:
ValueError: If `data_format` is not "NCDHW" or "NDHWC".
ValueError: If the channel dimmention of the input is less than or equal to zero.
ValueError: If the channel dimension of the input is less than or equal to zero.
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
or the element corresponding to the input's channel is not 0.
......@@ -1160,19 +1159,17 @@ def conv3d(x,
Examples:
.. code-block:: python
from paddle import fluid
import paddle.nn.functional as F
import paddle.fluid.dygraph as dg
import numpy as np
import paddle
import paddle.nn.functional as F
x = np.random.randn(2, 3, 8, 8, 8).astype(np.float32)
w = np.random.randn(6, 3, 3, 3, 3).astype(np.float32)
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
w_var = dg.to_variable(w)
y_var = F.conv3d(x_var, w_var, act="relu")
paddle.disable_static()
x_var = paddle.to_tensor(x)
w_var = paddle.to_tensor(w)
y_var = F.conv3d(x_var, w_var)
y_np = y_var.numpy()
print(y_np.shape)
......@@ -1190,7 +1187,7 @@ def conv3d(x,
num_filters = weight.shape[0]
if num_channels < 0:
raise ValueError(
"The channel dimmention of the input({}) should be defined. "
"The channel dimension of the input({}) should be defined. "
"Received: {}.".format(x.shape, num_channels))
if num_channels % groups != 0:
raise ValueError(
......@@ -1260,8 +1257,8 @@ def conv_transpose3d(x,
output_padding=0,
groups=1,
dilation=1,
data_format='NCDHW',
output_size=None,
data_format='NCDHW',
name=None):
"""
The convolution3d transpose layer calculates the output based on the input,
......@@ -1338,37 +1335,37 @@ def conv_transpose3d(x,
If stride is a tuple, it must contain three integers, (stride_depth, stride_height,
stride_width). Otherwise, stride_depth = stride_height = stride_width = stride.
Default: stride = 1.
padding(int|list|str|tuple, optional): The padding size. The padding argument effectively
adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string,
either 'VALID' or 'SAME' supported, which is the padding algorithm. If `padding`
is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or
padding (string|int|list|tuple, optional): The padding size. It means the number of zero-paddings
on both sides for each dimension. If `padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If padding size is a tuple or list,
it could be in three forms: `[pad_depth, pad_height, pad_width]` or
`[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
and when `data_format` is `'NCDHW'`, `padding` can be in the form
and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
`[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `'NDHWC'`, `padding` can be in the form
when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Default: padding = 0.
output_padding(int|list|tuple, optional): Additional size added to one side
of each dimension in the output shape. Default: 0.
dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points.
If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height,
dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation.
Default: dilation = 1.
groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by
grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
when group=2, the first half of the filters is only connected to the
first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels.
Default: groups=1
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points.
If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height,
dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation.
Default: dilation = 1.
output_size(int|list|tuple, optional): The output image size. If output size is a
tuple, it must contain three integers, (image_depth, image_height, image_width). This
parameter only works when filter_size is None. If output_size and filter_size are
specified at the same time, They should follow the formula above. Default: None.
Output_size and filter_size should not be None at the same time.
data_format (str, optional): Specify the data format of the input, and the data format of the output
            will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`.
            The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
            `[batch_size, input_channels, input_depth, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
......@@ -1425,7 +1422,7 @@ def conv_transpose3d(x,
num_filters = weight.shape[1]
if num_channels < 0:
raise ValueError(
"The channel dimmention of the input({}) should be defined. "
"The channel dimension of the input({}) should be defined. "
"Received: {}.".format(x.shape, num_channels))
if num_channels % groups != 0:
raise ValueError(
......
......@@ -784,30 +784,30 @@ def kl_div(input, label, reduction='mean', name=None):
import numpy as np
import paddle.nn.functional as F
paddle.enable_imperative()
paddle.disable_static()
shape = (5, 20)
input = np.random.uniform(-10, 10, shape).astype('float32')
target = np.random.uniform(-10, 10, shape).astype('float32')
# 'batchmean' reduction, loss shape will be [N]
pred_loss = F.kl_div(paddle.to_variable(input),
paddle.to_variable(target), reduction='batchmean')
pred_loss = F.kl_div(paddle.to_tensor(input),
paddle.to_tensor(target), reduction='batchmean')
# shape=[5]
# 'mean' reduction, loss shape will be [1]
pred_loss = F.kl_div(paddle.to_variable(input),
paddle.to_variable(target), reduction='mean')
pred_loss = F.kl_div(paddle.to_tensor(input),
paddle.to_tensor(target), reduction='mean')
# shape=[1]
# 'sum' reduction, loss shape will be [1]
pred_loss = F.kl_div(paddle.to_variable(input),
paddle.to_variable(target), reduction='sum')
pred_loss = F.kl_div(paddle.to_tensor(input),
paddle.to_tensor(target), reduction='sum')
# shape=[1]
# 'none' reduction, loss shape is same with input shape
pred_loss = F.kl_div(paddle.to_variable(input),
paddle.to_variable(target), reduction='none')
pred_loss = F.kl_div(paddle.to_tensor(input),
paddle.to_tensor(target), reduction='none')
# shape=[5, 20]
"""
......
......@@ -18,124 +18,146 @@ from ...fluid.layers import pool3d #DEFINE_ALIAS
from ...fluid.layers import adaptive_pool2d #DEFINE_ALIAS
from ...fluid.layers import adaptive_pool3d #DEFINE_ALIAS
from ...fluid import core
from ...fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_
from ...fluid.layers import utils, LayerHelper
from ...fluid.data_feeder import check_type, check_variable_and_dtype, check_type, check_dtype, convert_dtype
from ...fluid.layers import unsqueeze, squeeze
from ...fluid.framework import in_dygraph_mode
from ...fluid.layers import utils, LayerHelper, unsqueeze, squeeze
from ...fluid.data_feeder import check_type, check_variable_and_dtype
__all__ = [
'pool2d',
'pool3d',
'adaptive_pool2d',
'adaptive_pool3d',
'avg_pool1d',
'avg_pool2d',
'avg_pool3d',
'max_pool1d',
'max_pool2d',
'max_pool3d',
'adaptive_avg_pool1d',
'adaptive_max_pool1d',
'adaptive_avg_pool2d',
'adaptive_avg_pool3d',
'adaptive_pool2d',
'adaptive_pool3d',
'max_pool2d',
'avg_pool2d',
'max_pool3d',
'avg_pool3d',
'adaptive_max_pool1d',
'adaptive_max_pool2d',
'adaptive_max_pool3d',
]
def check_input(x, dimension):
def _is_list_or_tuple(input):
return isinstance(input, (list, tuple))
def _check_input(x, dimension):
if len(x.shape) != dimension:
raise ValueError("Excepted Input X is 3-D tensor, but received {}-D {}".
format(len(x.shape), type(x)))
raise ValueError(
"Excepted Input X is {}-D tensor, but received {}-D {}".format(
dimension, len(x.shape), type(x)))
def check_instance(x, x_name, types=(int, float)):
def _check_instance(x, x_name, types=(int, float)):
if not isinstance(x, types):
raise ValueError("Excepted {} type for {} but received type: {}. ".
format(types, x_name, type(x)))
def update_padding1d(padding, pool_type='avg'):
def is_list_or_tuple(ele):
if isinstance(ele, list) or isinstance(ele, tuple):
return True
return False
if is_list_or_tuple(padding):
if padding.__len__() == 1 and not is_list_or_tuple(padding[0]):
return [0, padding[0]]
else:
raise ValueError(
"{}_pool1d() argument 'padding' should contain one int (got {})".
format(pool_type, padding.__len__()))
def _zero_padding_in_batch_and_channel(padding, channel_last):
if channel_last:
return list(padding[0]) == [0, 0] and list(padding[-1]) == [0, 0]
else:
padding = [0, padding]
return list(padding[0]) == [0, 0] and list(padding[1]) == [0, 0]
return padding
def _exclude_padding_in_batch_and_channel(padding, channel_last):
padding_ = padding[1:-1] if channel_last else padding[2:]
padding_ = [elem for pad_a_dim in padding_ for elem in pad_a_dim]
return padding_
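# [Illustrative sketch, not part of the patch] Exercising the two helpers defined
# above on an NCHW-style padding spec: the batch/channel entries must be zero and
# are stripped before the spatial padding is handed on.
_p = [[0, 0], [0, 0], [1, 1], [2, 2]]
assert _zero_padding_in_batch_and_channel(_p, channel_last=False)
assert _exclude_padding_in_batch_and_channel(_p, channel_last=False) == [1, 1, 2, 2]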
def update_padding2d(padding, data_format):
def is_list_or_tuple(ele):
if isinstance(ele, list) or isinstance(ele, tuple):
return True
return False
if is_list_or_tuple(padding) and len(padding) == 4:
if is_list_or_tuple(padding[0]) and (data_format == "NCHW"):
if not (padding[0] == [0, 0] and padding[1] == [0, 0]):
def _channel_last(data_format, num_dims):
if num_dims == 1:
if data_format not in ['NCL', 'NLC']:
raise ValueError(
"Non-zero pool_padding(%s) in the batch or channel dimensions "
"is not supported." % str(padding))
padding = padding[2:4]
padding = [ele for a_list in padding for ele in a_list]
elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"):
if not (padding[0] == [0, 0] and padding[3] == [0, 0]):
"Attr(data_format) should be 'NCL' or 'NLC'. Received "
"Attr(data_format): %s" % str(data_format))
else:
return True if data_format == "NLC" else False
if num_dims == 2:
if data_format not in ['NCHW', 'NHWC']:
raise ValueError(
"Non-zero pool_padding(%s) in the batch or channel dimensions "
"is not supported." % str(padding))
padding = padding[1:3]
padding = [ele for a_list in padding for ele in a_list]
padding = utils.convert_to_list(padding, 4, 'padding')
if utils._is_symmetric_padding(padding, 2):
padding = [padding[0], padding[2]]
"Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
"Attr(data_format): %s" % str(data_format))
else:
padding = utils.convert_to_list(padding, 2, 'padding')
return padding
return True if data_format == "NHWC" else False
if num_dims == 3:
if data_format not in ['NCDHW', 'NDHWC']:
raise ValueError(
"Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
"Attr(data_format): %s" % str(data_format))
else:
return True if data_format == "NDHWC" else False
def update_padding3d(padding, data_format):
def is_list_or_tuple(ele):
if isinstance(ele, (list, tuple)):
return True
return False
if is_list_or_tuple(padding) and len(padding) == 5:
if is_list_or_tuple(padding[0]) and (data_format == "NCDHW"):
if not (padding[0] == [0, 0] and padding[1] == [0, 0]):
def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False):
if isinstance(padding, str):
padding = padding.upper()
if padding not in ["SAME", "VALID"]:
raise ValueError(
"Non-zero pool_padding(%s) in the batch or channel dimensions "
"is not supported." % str(padding))
padding = padding[2:5]
padding = [ele for a_list in padding for ele in a_list]
elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"):
if not (padding[0] == [0, 0] and padding[4] == [0, 0]):
"Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".
format(padding))
if padding == "VALID":
if ceil_mode != False:
raise ValueError(
"Non-zero pool_padding(%s) in the batch or channel dimensions "
"is not supported." % str(padding))
padding = padding[1:4]
padding = [ele for a_list in padding for ele in a_list]
padding = utils.convert_to_list(padding, 6, 'padding')
if utils._is_symmetric_padding(padding, 3):
padding = [padding[0], padding[2], padding[4]]
elif is_list_or_tuple(padding) and len(padding) == 6:
padding = utils.convert_to_list(padding, 6, 'padding')
if utils._is_symmetric_padding(padding, 3):
padding = [padding[0], padding[2], padding[4]]
"When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
"Received ceil_mode: True.")
padding_algorithm = "VALID"
padding = [0] * num_dims
else:
padding_algorithm = "SAME"
padding = [0] * num_dims
elif _is_list_or_tuple(padding):
# for padding like
# [(pad_before, pad_after), (pad_before, pad_after), ...]
# padding for batch_dim and channel_dim included
if len(padding) == 2 + num_dims and _is_list_or_tuple(padding[0]):
if not _zero_padding_in_batch_and_channel(padding, channel_last):
raise ValueError(
"Non-zero padding({}) in the batch or channel dimensions "
"is not supported.".format(padding))
padding_algorithm = "EXPLICIT"
padding = _exclude_padding_in_batch_and_channel(padding,
channel_last)
if utils._is_symmetric_padding(padding, num_dims):
padding = padding[0::2]
# for padding like [pad_before, pad_after, pad_before, pad_after, ...]
elif len(padding) == 2 * num_dims and isinstance(padding[0], int):
padding_algorithm = "EXPLICIT"
padding = utils.convert_to_list(padding, 2 * num_dims, 'padding')
if utils._is_symmetric_padding(padding, num_dims):
padding = padding[0::2]
# for padding like [pad_d1, pad_d2, ...]
elif len(padding) == num_dims and isinstance(padding[0], int):
padding_algorithm = "EXPLICIT"
padding = utils.convert_to_list(padding, num_dims, 'padding')
else:
raise ValueError("Invalid padding: {}".format(padding))
# for integer padding
else:
padding = utils.convert_to_list(padding, 3, 'padding')
padding_algorithm = "EXPLICIT"
padding = utils.convert_to_list(padding, num_dims, 'padding')
return padding, padding_algorithm
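# [Illustrative sketch, not part of the patch] _update_padding_nd (defined above)
# folds every accepted padding spelling into a flat list plus a padding_algorithm
# flag; for a 2-D pooling call the expected results look like:
assert _update_padding_nd("same", 2) == ([0, 0], "SAME")
assert _update_padding_nd(2, 2) == ([2, 2], "EXPLICIT")
assert _update_padding_nd([1, 1, 2, 2], 2) == ([1, 2], "EXPLICIT")  # symmetric pairs collapse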
def _expand_low_nd_padding(padding):
#1d to 2d fake input
if len(padding) == 2:
padding = [0] * 2 + padding
elif len(padding) == 1:
padding = [0] + padding
else:
raise ValueError(
"The size of padding's dimmention should be 1 or 2. But got padding={}".
format(padding))
return padding
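# [Illustrative sketch, not part of the patch] 1-D pooling is emulated with the
# 2-D kernel on a fake [N, C, 1, L] input, so the 1-D padding first gains a zero
# entry for the fake height dimension:
assert _expand_low_nd_padding([3]) == [0, 3]
assert _expand_low_nd_padding([1, 2]) == [0, 0, 1, 2]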
......@@ -147,72 +169,56 @@ def avg_pool1d(x,
ceil_mode=False,
name=None):
"""
This operation applies a 1D average pooling over an input signal composed
of several input planes, based on the input, output_size, return_indices parameters.
Input(X) and output(Out) are in NCL format, where N is batch
size, C is the number of channels, L is the length of the feature.
The output tensor shape will be [N, C, output_size].
The output value of the layer with input size (N, C, L),
output (N, C, L_{out}) and kernel_size k can be precisely described as
For average pool1d:
.. math::
Output(N_i, C_i, l) &= mean(Input[N_i, C_i, stride \times l:stride \times l+k])
    This API implements average pooling 1d operation.
See more details in :ref:`api_nn_pooling_AvgPool1d` .
Args:
x (Tensor): The input tensor of pooling operator which is a 3-D tensor with
shape [N, C, L]. where `N` is batch size, `C` is the number of channels,
`L` is the length of the feature. The data type if float32 or float64.
`L` is the length of the feature. The data type is float32 or float64.
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain one integers.
it must contain an integer.
stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain one integers.
padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero,
then the input is implicitly zero-padded on both sides for padding number of points.
it must contain an integer.
padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
1. A string in ['valid', 'same'].
            2. An int, which means the feature map is zero padded by size of `padding` on both sides.
            3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on both sides.
4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
The default value is 0.
count_include_pad (bool): Whether to exclude padding points in average pooling
mode, default is `true`.
mode, default is `True`.
ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width.
If it is set to False, the floor function will be used. Default False
If it is set to False, the floor function will be used. The default value is False.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Tensor: The output tensor of pooling result. The data type is same as input tensor.
Raises:
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is "VALID", but `ceil_mode` is True.
ValueError: If `padding` is a list or tuple but its length greater than 1.
ShapeError: If the input is not a 3-D.
ValueError: If `padding` is a list or tuple but its length is greater than 1.
ShapeError: If the input is not a 3-D tensor.
ShapeError: If the output's shape calculated is not greater than 0.
Examples:
.. code-block:: python
import paddle
          import paddle.nn.functional as F
          import numpy as np
paddle.disable_static()
data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
pool_out = F.avg_pool1d(data, kernel_size=2, stride=2, padding=0)
# pool_out shape: [1, 3, 16]
out = F.avg_pool1d(data, kernel_size=2, stride=2, padding=0)
# out shape: [1, 3, 16]
"""
"""NCL to NCHW"""
data_format = "NCHW"
check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'avg_pool1d')
check_input(x, 3)
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool1d')
_check_input(x, 3)
x = unsqueeze(x, [2])
kernel_size = utils.convert_to_list(kernel_size, 1, 'pool_size')
kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size')
kernel_size = [1] + kernel_size
if stride is None:
stride = kernel_size
......@@ -220,33 +226,20 @@ def avg_pool1d(x,
stride = utils.convert_to_list(stride, 1, 'pool_stride')
stride = [1] + stride
padding_algorithm = "EXPLICIT"
if isinstance(padding, str):
padding = padding.upper()
if padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'."
% str(padding))
if padding == "VALID":
padding_algorithm = "VALID"
padding = [0]
if ceil_mode != False:
raise ValueError(
"When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
"Received ceil_mode: True.")
elif padding == "SAME":
padding_algorithm = "SAME"
padding = [0]
channel_last = _channel_last("NCL", 1)
padding, padding_algorithm = _update_padding_nd(
padding, 1, channel_last=channel_last, ceil_mode=ceil_mode)
padding = update_padding1d(padding, "avg")
# use 2d to implenment 1d should expand padding in advance.
padding = _expand_low_nd_padding(padding)
if in_dygraph_mode():
output = core.ops.pool2d(
x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling',
False, 'strides', stride, 'paddings', padding, 'padding_algorithm',
padding_algorithm, 'use_cudnn', not count_include_pad, 'ceil_mode',
ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format',
data_format)
padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
'use_mkldnn', False, 'exclusive', not count_include_pad,
'data_format', data_format)
return squeeze(output, [2])
op_type = 'pool2d'
......@@ -275,126 +268,103 @@ def avg_pool1d(x,
return squeeze(pool_out, [2])
def max_pool1d(x,
def avg_pool2d(x,
kernel_size,
stride=None,
padding=0,
return_indices=False,
ceil_mode=False,
count_include_pad=True,
divisor_override=None,
data_format="NCHW",
name=None):
"""
Applies a 1D max pooling over an input signal composed of several input planes based
on the input, output_size, return_indices parameters.
Input(X) and output(Out) are in NCL format, where N is batch
size, C is the number of channels, L is the length of the feature.
The output value of the layer with input size (N, C, L),
output (N, C, L_{out}) and kernel_size k can be precisely described as
For average pool1d:
.. math::
Output(N_i, C_i, l) &= max(Input[N_i, C_i, stride \times l:stride \times l+k])}
This API implements average pooling 2d operation.
See more details in :ref:`api_nn_pooling_AvgPool2d` .
Args:
x (Tensor): The input tensor of pooling operator which is a 3-D tensor with
shape [N, C, L], where `N` is batch size, `C` is the number of channels,
`L` is the length of the feature. The data type if float32 or float64.
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain one integers.
stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain one integers.
padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
it could be the following forms: `[pad_left, pad_right]`.
return_indices (bool): Whether return the max indices along with the outputs. default is `False`.
ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default.
If it is set to False, the floor function will be used. Default False.
x (Tensor): The input tensor of pooling operator which is a 4-D tensor with
shape [N, C, H, W]. The format of input tensor is `"NCHW"` or
`"NHWC"`, where `N` is batch size, `C` is the number of channels,
`H` is the height of the feature, and `W` is the width of the
                          feature. The data type is float32 or float64.
kernel_size (int|list|tuple): The pool kernel size. If it is a tuple or list,
it must contain two integers, (kernel_size_Height, kernel_size_Width).
Otherwise, the pool kernel size will be a square of an int.
stride (int|list|tuple): The stride size. If it is a tuple or list,
it must contain two integers, (stride_Height, stride_Width).
Otherwise, the stride size will be a square of an int.
padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
1. A string in ['valid', 'same'].
            2. An int, which means the feature map is zero padded by size of `padding` on all sides.
            3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_width] whose value means the padding size of each dimension.
4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
The default value is 0.
ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
        count_include_pad (bool): Whether to include padding points in the averaging
                          calculation. Default is `True`.
divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Tensor: The output tensor of pooling result. The data type is same as input tensor.
Raises:
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is "VALID", but `ceil_mode` is True.
ValueError: If `padding` is a list or tuple but its length greater than 1.
ShapeError: If the input is not a 3-D.
ShapeError: If the output's shape calculated is not greater than 0.
Examples:
.. code-block:: python
import paddle
import paddle.nn.functional as F
import numpy as np
paddle.disable_static()
data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0)
# pool_out shape: [1, 3, 16]
pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_indices=True)
# pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
# avg pool2d
x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
out = F.avg_pool2d(x,
kernel_size=2,
stride=2, padding=0)
# out.shape [1, 3, 16, 16]
"""
"""NCL to NCHW"""
data_format = "NCHW"
check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'max_pool1d')
check_input(x, 3)
x = unsqueeze(x, [2])
kernel_size = [1] + utils.convert_to_list(kernel_size, 1, 'pool_size')
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool2d')
kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size')
if stride is None:
stride = kernel_size
else:
stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride')
padding_algorithm = "EXPLICIT"
if isinstance(padding, str):
padding = padding.upper()
if padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'."
% str(padding))
if padding == "VALID":
padding_algorithm = "VALID"
padding = [0]
if ceil_mode != False:
raise ValueError(
"When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
"Received ceil_mode: True.")
elif padding == "SAME":
padding_algorithm = "SAME"
padding = [0]
stride = utils.convert_to_list(stride, 2, 'pool_stride')
padding = update_padding1d(padding, 'max')
channel_last = _channel_last(data_format, 2)
padding, padding_algorithm = _update_padding_nd(
padding, 2, channel_last, ceil_mode=ceil_mode)
if in_dygraph_mode():
pool_out = core.ops.max_pool2d_with_index(
x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride,
'paddings', padding, 'padding_algorithm', padding_algorithm,
'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False,
'exclusive', True, 'data_format', data_format)
return (squeeze(pool_out[0], [2]), squeeze(
pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
output = core.ops.pool2d(
x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling',
False, 'padding_algorithm', padding_algorithm, 'strides', stride,
'paddings', padding, 'use_cudnn', True, 'ceil_mode', ceil_mode,
'use_mkldnn', False, 'exclusive', not count_include_pad,
'data_format', data_format)
if divisor_override is None:
return output
else:
_check_instance(divisor_override, "divisor_override")
return output * (kernel_size[0] * kernel_size[1]) / divisor_override
op_type = 'max_pool2d_with_index'
op_type = 'pool2d'
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
mask = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": pool_out, "Mask": mask}
helper.append_op(
type=op_type,
inputs={"X": x},
outputs=outputs,
outputs={"Out": pool_out},
attrs={
"pooling_type": 'max',
"pooling_type": "avg",
"ksize": kernel_size,
"global_pooling": False,
"strides": stride,
......@@ -403,335 +373,211 @@ def max_pool1d(x,
"use_cudnn": True,
"ceil_mode": ceil_mode,
"use_mkldnn": False,
"exclusive": True,
"exclusive": not count_include_pad,
"data_format": data_format,
})
return (squeeze(pool_out, [2]),
squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
if divisor_override is None:
return pool_out
else:
_check_instance(divisor_override, "divisor_override")
return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override
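# [Illustrative sketch, not part of the patch] What divisor_override changes in
# avg_pool2d: the window sum stays the same, only the kernel-area denominator is
# replaced. Minimal dygraph usage, assuming the 2.0-beta API used in this file:
import numpy as np
import paddle
import paddle.nn.functional as F

paddle.disable_static()
x = paddle.to_tensor(np.ones([1, 1, 4, 4], dtype='float32'))
plain = F.avg_pool2d(x, kernel_size=2, stride=2)                      # every value: 4 / 4 = 1.0
scaled = F.avg_pool2d(x, kernel_size=2, stride=2, divisor_override=2) # every value: 4 / 2 = 2.0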
def adaptive_avg_pool1d(x, output_size, name=None):
def avg_pool3d(x,
kernel_size,
stride=None,
padding=0,
ceil_mode=False,
count_include_pad=False,
divisor_override=None,
data_format="NCDHW",
name=None):
"""
This operation applies a 1D adaptive average pooling over an input signal composed
of several input planes, based on the input, output_size, return_indices parameters.
Input(X) and output(Out) are in NCL format, where N is batch
size, C is the number of channels, L is the length of the feature.
The output tensor shape will be [N, C, output_size].
For average adaptive pool1d:
.. math::
lstart &= floor(i * L_{in} / L_{out})
lend &= ceil((i + 1) * L_{in} / L_{out})
Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)}
This API implements average pooling 3d operation.
See more details in :ref:`api_nn_pooling_AvgPool3d` .
Args:
x (Tensor): The input tensor of pooling operator, which is a 3-D tensor
with shape [N, C, L]. The format of input tensor is NCL,
where N is batch size, C is the number of channels, L is the
length of the feature. The data type is float32 or float64.
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain one int.
x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with
shape [N, C, D, H, W], where `N` represents the batch size, `C` represents
the number of channels, `D`, `H` and `W` represent the depth, height and width of the feature respectively.
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
is a tuple or list, it must contain three integers,
(kernel_size_Depth, kernel_size_Height, kernel_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain three integers, [stride_Depth, stride_Height, stride_Width).
Otherwise, the pool stride size will be a cube of an int.
padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
1. A string in ['valid', 'same'].
            2. An int, which means the feature map is zero padded by size of `padding` on all sides.
            3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_width] whose value means the padding size of each dimension.
4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
The default value is 0.
ceil_mode (bool): ${ceil_mode_comment}
        count_include_pad (bool): Whether to include padding points in the averaging
                          calculation. Default is `False`.
        divisor_override (int|float): If specified, it will be used as the divisor; otherwise kernel_size will be used. Default None.
data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_depth, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Tensor: The output tensor of adaptive average pooling result. The data type is same
as input tensor.
Tensor: The output tensor of pooling result. The data type is same as input tensor.
Raises:
ValueError: 'output_size' should be a integer or list or tuple with length as 1.
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is "VALID", but `ceil_mode` is True.
ShapeError: If the output's shape calculated is not greater than 0.
Examples:
.. code-block:: python
# average adaptive pool1d
# suppose input data in shape of [N, C, L], `output_size` is m or [m],
# output shape is [N, C, m], adaptive pool divide L dimension
# of input data into m grids averagely and performs poolings in each
# grid to get output.
# adaptive max pool performs calculations as follow:
#
# for i in range(m):
# lstart = floor(i * L / m)
# lend = ceil((i + 1) * L / m)
# output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend)
#
import paddle.fluid as fluid
import paddle
          import paddle.nn.functional as F
          import numpy as np
paddle.disable_static()
data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
pool_out = F.adaptive_average_pool1d(data, output_size=16)
# pool_out shape: [1, 3, 16])
x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
# avg pool3d
out = paddle.nn.functional.avg_pool3d(
x,
kernel_size = 2,
stride = 2,
padding=0)
# out.shape: [1, 3, 16, 16, 16]
"""
pool_type = 'avg'
check_variable_and_dtype(x, 'input', ['float32', 'float64'],
'adaptive_pool2d')
check_input(x, 3)
check_type(output_size, 'pool_size', (int), 'adaptive_pool1d')
    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool3d')
kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size')
if stride is None:
stride = kernel_size
else:
stride = utils.convert_to_list(stride, 3, 'pool_stride')
pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
channel_last = _channel_last(data_format, 3)
padding, padding_algorithm = _update_padding_nd(
padding, 3, channel_last=channel_last, ceil_mode=ceil_mode)
l_type = "pool2d"
x = unsqueeze(x, [2])
if in_dygraph_mode():
pool_out = core.ops.pool2d(x, 'pooling_type', pool_type, 'ksize',
pool_size, 'adaptive', True)
return squeeze(pool_out, [2])
output = core.ops.pool3d(
x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride,
'paddings', padding, 'global_pooling', False, 'padding_algorithm',
padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
'use_mkldnn', False, 'exclusive', not count_include_pad,
'data_format', data_format)
if divisor_override is None:
return output
else:
_check_instance(divisor_override, "divisor_override")
return output * (kernel_size[0] * kernel_size[1] *
kernel_size[2]) / divisor_override
helper = LayerHelper(l_type, **locals())
op_type = "pool3d"
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": pool_out}
helper.append_op(
type=l_type,
inputs={"X": x},
outputs=outputs,
attrs={
"pooling_type": pool_type,
"ksize": pool_size,
"adaptive": True,
})
return squeeze(pool_out, [2])
def adaptive_max_pool1d(x, output_size, return_indices=False, name=None):
"""
This operation applies a 1D adaptive max pooling over an input signal composed
of several input planes, based on the input, output_size, return_indices parameters.
Input(X) and output(Out) are in NCL format, where N is batch
size, C is the number of channels, L is the length of the feature.
The output tensor shape will be [N, C, output_size].
For max adaptive pool1d:
.. math::
lstart &= floor(i * L_{in} / L_{out})
lend &= ceil((i + 1) * L_{in} / L_{out})
Output(i) &= max(Input[lstart:lend])}
Args:
x (Tensor): The input tensor of pooling operator, which is a 3-D tensor
with shape [N, C, L]. The format of input tensor is NCL,
where N is batch size, C is the number of channels, L is the
length of the feature. The data type is float32 or float64.
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain one int.
return_indices (bool): If true, the index of max pooling point will be returned along
with outputs. It cannot be set in average pooling type. Default False.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Tensor: The output tensor of adaptive pooling result. The data type is same
as input tensor.
Raises:
ValueError: 'output_size' should be a integer or list or tuple with length as 1.
Examples:
.. code-block:: python
# max adaptive pool1d
# suppose input data in shape of [N, C, L], `output_size` is m or [m],
# output shape is [N, C, m], adaptive pool divide L dimension
# of input data into m grids averagely and performs poolings in each
# grid to get output.
# adaptive max pool performs calculations as follow:
#
# for i in range(m):
# lstart = floor(i * L / m)
# lend = ceil((i + 1) * L / m)
# output[:, :, i] = max(input[:, :, lstart: lend])
#
import paddle
import paddle.nn.functional as F
paddle.disable_static()
data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
pool_out = F.adaptive_max_pool1d(data, output_size=16)
# pool_out shape: [1, 3, 16])
pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_indices=True)
# pool_out shape: [1, 3, 16] indices shape: [1, 3, 16]
"""
pool_type = 'max'
check_variable_and_dtype(x, 'input', ['float32', 'float64'],
'adaptive_max_pool1d')
check_input(x, 3)
check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d')
check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d')
pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
l_type = 'max_pool2d_with_index'
x = unsqueeze(x, [2])
if in_dygraph_mode():
pool_out = core.ops.max_pool2d_with_index(
x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True)
return (squeeze(pool_out[0], [2]), squeeze(
pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
mask = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": pool_out, "Mask": mask}
helper.append_op(
type=l_type,
type=op_type,
inputs={"X": x},
outputs=outputs,
attrs={
"pooling_type": pool_type,
"ksize": pool_size,
"adaptive": True,
"pooling_type": 'avg',
"ksize": kernel_size,
"global_pooling": False,
"strides": stride,
"paddings": padding,
"padding_algorithm": padding_algorithm,
"use_cudnn": True,
"ceil_mode": ceil_mode,
"use_mkldnn": False,
"exclusive": not count_include_pad,
"data_format": data_format,
})
return (squeeze(pool_out, [2]),
squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
if divisor_override is None:
return pool_out
else:
_check_instance(divisor_override, "divisor_override")
return pool_out * (kernel_size[0] * kernel_size[1] *
kernel_size[2]) / divisor_override
def max_pool2d(x,
def max_pool1d(x,
kernel_size,
stride=None,
padding=0,
return_indices=False,
ceil_mode=False,
data_format="NCHW",
name=None):
"""
This operation applies 2D max pooling over input feature based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCHW format, where N is batch size, C is the number of channels,
H is the height of the feature, and W is the width of the feature.
Example:
Input:
X shape: $(N, C, H_{in}, W_{in})$
Attr:
kernel_size: ksize
stride: stride
Output:
Out shape: $(N, C, H_{out}, W_{out})$
$$
out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\
& \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
\text{stride[1]} \times w + n)
$$
    This API implements max pooling 1d operation.
See more details in :ref:`api_nn_pooling_MaxPool1d` .
Args:
x (Tensor): The input tensor of pooling operator which is a 4-D tensor with
shape [N, C, H, W]. The format of input tensor is `"NCHW"` or
`"NHWC"`, where `N` is batch size, `C` is the number of channels,
`H` is the height of the feature, and `W` is the width of the
feature. The data type if float32 or float64.
Args:
x (Tensor): The input tensor of pooling operator which is a 3-D tensor with
shape [N, C, L], where `N` is batch size, `C` is the number of channels,
                          `L` is the length of the feature. The data type is float32 or float64.
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be a square of an int.
it must contain an integer.
stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain two integers, (pool_stride_Height, pool_stride_Width).
Otherwise, the pool stride size will be a square of an int.
padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
it could be in three forms: `[pad_height, pad_width]` or
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
`pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Otherwise, the pool padding size will be a square of an int.
ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
return_indices (bool): Whether to return the max indices along with the outputs.
data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
it must contain an integer.
padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
1. A string in ['valid', 'same'].
2. An integer, which means the feature map is zero padded by size of `padding` on every side.
3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every side.
4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
The default value is 0.
return_indices (bool): Whether to return the max indices along with the outputs. Default is `False`.
ceil_mode (bool): Whether to use the ceil function to calculate the output height and width.
If it is set to False, the floor function will be used. Default False.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Tensor: The output tensor of pooling result. The data type is same as input tensor.
Raises:
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is "VALID", but `ceil_mode` is True.
ShapeError: If the input is not a 3-D tensor.
ShapeError: If the output's shape calculated is not greater than 0.
Examples:
.. code-block:: python
import paddle
import paddle.nn.functional as F
import numpy as np
paddle.disable_static()
# max pool2d
input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
output = F.max_pool2d(input,
kernel_size=2,
stride=2, padding=0)
# output.shape [1, 3, 16, 16]
# for return_indices=True
output, max_indices = F.max_pool2d(input,
kernel_size=2,
stride=2,
padding=0,
return_indices=True)
# output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0)
# pool_out shape: [1, 3, 16]
pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_indices=True)
# pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
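# a minimal extra sketch (reusing `data` from above); per the padding notes in
# Args, a string form is also accepted, and 'same' keeps ceil(L / stride) steps:
pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding='same')
# expected pool_out shape: [1, 3, 16]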
"""
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool2d')
kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size')
"""NCL to NCHW"""
data_format = "NCHW"
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool1d')
_check_input(x, 3)
x = unsqueeze(x, [2])
kernel_size = [1] + utils.convert_to_list(kernel_size, 1, 'pool_size')
if stride is None:
stride = kernel_size
else:
stride = utils.convert_to_list(stride, 2, 'pool_stride')
stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride')
if data_format not in ["NCHW", "NHWC"]:
raise ValueError(
"Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
"Attr(data_format): %s." % str(data_format))
padding_algorithm = "EXPLICIT"
if isinstance(padding, str):
padding = padding.upper()
if padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'."
% str(padding))
if padding == "VALID":
padding_algorithm = "VALID"
padding = [0, 0]
if ceil_mode != False:
raise ValueError(
"When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
"Received ceil_mode: True.")
elif padding == "SAME":
padding_algorithm = "SAME"
padding = [0, 0]
padding, padding_algorithm = _update_padding_nd(
padding, 1, ceil_mode=ceil_mode)
padding = update_padding2d(padding, data_format)
# using pool2d to implement pool1d requires expanding the padding in advance.
padding = _expand_low_nd_padding(padding)
if in_dygraph_mode():
output = core.ops.max_pool2d_with_index(
pool_out = core.ops.max_pool2d_with_index(
x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride,
'paddings', padding, 'padding_algorithm', padding_algorithm,
'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False,
'exclusive', True, 'data_format', data_format)
return output if return_indices else output[0]
return (squeeze(pool_out[0], [2]), squeeze(
pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
op_type = 'max_pool2d_with_index'
helper = LayerHelper(op_type, **locals())
......@@ -758,36 +604,21 @@ def max_pool2d(x,
"data_format": data_format,
})
return (pool_out, mask) if return_indices else pool_out
return (squeeze(pool_out, [2]),
squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
def avg_pool2d(x,
def max_pool2d(x,
kernel_size,
stride=None,
padding=0,
return_indices=False,
ceil_mode=False,
count_include_pad=True,
divisor_override=None,
data_format="NCHW",
name=None):
"""
This operation applies 2D average pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCHW format, where N is batch size, C is the number of channels,
H is the height of the feature, and W is the width of the feature.
Example:
Input:
X shape: $(N, C, H_{in}, W_{in})$
Attr:
kernel_size: ksize
Output:
Out shape: $(N, C, H_{out}, W_{out})$
$$
out(N_i, C_j, h, w) = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1}
input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
$$
This API implements max pooling 2d operation.
See more details in :ref:`api_nn_pooling_MaxPool2d` .
Args:
x (Tensor): The input tensor of pooling operator which is a 4-D tensor with
......@@ -796,30 +627,26 @@ def avg_pool2d(x,
`H` is the height of the feature, and `W` is the width of the
feature. The data type is float32 or float64.
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
it must contain two integers, (kernel_size_Height, kernel_size_Width).
Otherwise, the pool kernel size will be a square of an int.
stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain two integers, (pool_stride_Height, pool_stride_Width).
it must contain two integers, (stride_Height, stride_Width).
Otherwise, the pool stride size will be a square of an int.
padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
it could be in three forms: `[pad_height, pad_width]` or
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
`pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Otherwise, the pool padding size will be a square of an int.
padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
1. A string in ['valid', 'same'].
2. An int, which means the feature map is zero padded by size of `padding` on every side.
3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_width] whose value means the padding size of each dimension.
4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
The default value is 0.
ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
count_include_pad (bool): Whether to exclude padding points in average pooling
mode, default is `true`.
divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
return_indices (bool): Whether to return the max indices along with the outputs.
data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Tensor: The output tensor of pooling result. The data type is same as input tensor.
Raises:
......@@ -832,87 +659,71 @@ def avg_pool2d(x,
import paddle.nn.functional as F
import numpy as np
paddle.disable_static()
# avg pool2d
input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
output = F.avg_pool2d(input,
# max pool2d
x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
out = F.max_pool2d(x,
kernel_size=2,
stride=2, padding=0)
# output.shape [1, 3, 16, 16]
# for return_indices=True
out, max_indices = F.max_pool2d(x,
kernel_size=2,
stride=2,
padding=0,
return_indices=True)
# out.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
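# shape check for the calls above, using the usual pooling formula with zero
# padding and ceil_mode=False: H_out = floor((32 + 2*0 - 2) / 2) + 1 = 16 (same for W_out)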
"""
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool2d')
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool2d')
kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size')
if stride is None:
stride = kernel_size
else:
stride = utils.convert_to_list(stride, 2, 'pool_stride')
padding_algorithm = "EXPLICIT"
if isinstance(padding, str):
padding = padding.upper()
if padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
% str(padding))
if padding == "VALID":
padding_algorithm = "VALID"
padding = [0, 0]
if ceil_mode != False:
raise ValueError(
"When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. "
"Received ceil_mode: True.")
elif padding == "SAME":
padding_algorithm = "SAME"
padding = [0, 0]
if data_format not in ["NCHW", "NHWC"]:
raise ValueError(
"Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
"Attr(data_format): %s." % str(data_format))
pool_padding = update_padding2d(padding, data_format)
channel_last = True if data_format == "NHWC" else False
padding, padding_algorithm = _update_padding_nd(
padding, num_dims=2, channel_last=channel_last, ceil_mode=ceil_mode)
if in_dygraph_mode():
output = core.ops.pool2d(
x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling',
False, 'padding_algorithm', padding_algorithm, 'strides', stride,
'paddings', pool_padding, 'use_cudnn', True, 'ceil_mode', ceil_mode,
'use_mkldnn', False, 'exclusive', not count_include_pad,
'data_format', data_format)
if divisor_override is None:
return output
else:
check_instance(divisor_override, "divisor_override")
return output * (kernel_size[0] * kernel_size[1]) / divisor_override
output = core.ops.max_pool2d_with_index(
x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride,
'paddings', padding, 'padding_algorithm', padding_algorithm,
'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False,
'exclusive', True, 'data_format', data_format)
return output if return_indices else output[0]
op_type = 'pool2d'
op_type = 'max_pool2d_with_index'
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
mask = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": pool_out, "Mask": mask}
helper.append_op(
type=op_type,
inputs={"X": x},
outputs={"Out": pool_out},
outputs=outputs,
attrs={
"pooling_type": "avg",
"pooling_type": 'max',
"ksize": kernel_size,
"global_pooling": False,
"strides": stride,
"paddings": pool_padding,
"paddings": padding,
"padding_algorithm": padding_algorithm,
"use_cudnn": True,
"ceil_mode": ceil_mode,
"use_mkldnn": False,
"exclusive": not count_include_pad,
"exclusive": True,
"data_format": data_format,
})
if divisor_override is None:
return pool_out
else:
check_instance(divisor_override, "divisor_override")
return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override
return (pool_out, mask) if return_indices else pool_out
def max_pool3d(x,
......@@ -924,47 +735,25 @@ def max_pool3d(x,
data_format="NCDHW",
name=None):
"""
This operation applies 3D max pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCDHW format, where N is batch size, C is the number of channels,
H is the height of the feature, D is the depth of the feature, and W is the width of the feature.
Example:
Input:
X shape: $(N, C, D_{in}, H_{in}, W_{in})$
Attr:
kernel_size: ksize
Output:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
$$
\text{out}(N_i, C_j, d, h, w) ={} & \max_{k=0, \ldots, ksize[0]-1} \max_{m=0, \ldots, ksize[1]-1} \max_{n=0, \ldots, ksize[2]-1} \\
& \text{input}(N_i, C_j, \text{stride[0]} \times d + k,
\text{stride[1]} \times h + m, \text{stride[2]} \times w + n)
$$
This API implements max pooling 3d operation.
See more details in :ref:`api_nn_pooling_MaxPool3d` .
Args:
x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with
shape [N, C, D, H, W]. The format of
input tensor is `"NCDHW"` or `"NDHWC"`, where `N` is batch size, `C` is
the number of channels, `D` is the depth of the feature,
`H` is the height of the feature, and `W` is the width
of the feature.
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` or `"NDHWC"`, where N represents batch size, C represents the number of channels, D, H and W represent the depth, height and width of the feature respectively.
kernel_size (int|list|tuple): The pool kernel size. If the kernel size
is a tuple or list, it must contain three integers,
(pool_size_Depth, pool_size_Height, pool_size_Width).
(kernel_size_Depth, kernel_size_Height, kernel_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain three integers, [stride_Depth, stride_Height, stride_Width].
Otherwise, the pool stride size will be a cube of an int.
padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
it could be in three forms: `[pad_depth, pad_height, pad_width]` or
`[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
`[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
1. A string in ['valid', 'same'].
2. An int, which means the feature map is zero padded by size of `padding` on every side.
3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_width] whose value means the padding size of each dimension.
4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
The default value is 0.
ceil_mode (bool): ${ceil_mode_comment}
return_indices (bool): Whether to return the max indices along with the outputs.
data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
......@@ -973,7 +762,6 @@ def max_pool3d(x,
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Tensor: The output tensor of pooling result. The data type is same as input tensor.
Raises:
......@@ -986,23 +774,20 @@ def max_pool3d(x,
import paddle.nn.functional as F
import numpy as np
paddle.disable_static()
# max pool3d
input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
output = F.max_pool2d(input,
x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
output = F.max_pool3d(x,
kernel_size=2,
stride=2, padding=0)
# output.shape [1, 3, 16, 16, 16]
# for return_indices=True
input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
output, max_indices = paddle.nn.functional.max_pool3d(input,
x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
output, max_indices = paddle.nn.functional.max_pool3d(x,
kernel_size = 2,
stride = 2,
padding=0,
return_indices=True)
# output.shape [None, 3, 16, 16, 16], max_indices.shape [None, 3, 16, 16, 16],
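# shape check for the calls above (zero padding, ceil_mode=False):
# D_out = H_out = W_out = floor((32 - 2) / 2) + 1 = 16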
"""
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d')
kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size')
......@@ -1011,29 +796,10 @@ def max_pool3d(x,
else:
stride = utils.convert_to_list(stride, 3, 'pool_stride')
padding_algorithm = "EXPLICIT"
if isinstance(padding, str):
padding = padding.upper()
if padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
% str(padding))
if padding == "VALID":
padding_algorithm = "VALID"
padding = [0, 0, 0]
if ceil_mode != False:
raise ValueError(
"When Attr(pool_padding) is \"VALID\", ceil_mode must be False. "
"Received ceil_mode: True.")
elif padding == "SAME":
padding_algorithm = "SAME"
padding = [0, 0, 0]
channel_last = _channel_last(data_format, 3)
if data_format not in ["NCDHW", "NDHWC"]:
raise ValueError(
"Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
"Attr(data_format): %s" % str(data_format))
padding = update_padding3d(padding, data_format)
padding, padding_algorithm = _update_padding_nd(
padding, 3, channel_last=channel_last, ceil_mode=ceil_mode)
if in_dygraph_mode():
output = core.ops.max_pool3d_with_index(
......@@ -1071,170 +837,83 @@ def max_pool3d(x,
return (pool_out, mask) if return_indices else pool_out
def avg_pool3d(x,
kernel_size,
stride=None,
padding=0,
ceil_mode=False,
count_include_pad=False,
divisor_override=None,
data_format="NCDHW",
name=None):
def adaptive_avg_pool1d(x, output_size, name=None):
"""
This operation applies 3D max pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCDHW format, where N is batch size, C is the number of channels,
H is the height of the feature, D is the depth of the feature, and W is the width of the feature.
This API implements adaptive average pooling 1d operation.
See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` .
Args:
input (Tensor): The input tensor of pooling operator, which is a 5-D tensor with
shape [N, C, D, H, W], where `N` is batch size, `C` is
the number of channels, `D` is the depth of the feature,
`H` is the height of the feature, and `W` is the width
of the feature.
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
is a tuple or list, it must contain three integers,
(pool_size_Depth, pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
Otherwise, the pool stride size will be a cube of an int.
padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
it could be in three forms: `[pad_depth, pad_height, pad_width]` or
`[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
`[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
ceil_mode (bool): ${ceil_mode_comment}
count_include_pad (bool): Whether to exclude padding points in average pooling
mode, default is True.
divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_depth, input_height, input_width]`.
x (Tensor): The input tensor of pooling operator, which is a 3-D tensor
with shape [N, C, L]. The format of input tensor is NCL,
where N is batch size, C is the number of channels, L is the
length of the feature. The data type is float32 or float64.
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain one int.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Tensor: The output tensor of pooling result. The data type is same as input tensor.
Tensor: The output tensor of adaptive average pooling result. The data type is same
as input tensor.
Raises:
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is "VALID", but `ceil_mode` is True.
ShapeError: If the output's shape calculated is not greater than 0.
ValueError: 'output_size' should be an integer or a list or tuple of length 1.
Examples:
.. code-block:: python
import paddle.fluid as fluid
# average adaptive pool1d
# suppose input data in shape of [N, C, L], `output_size` is m or [m],
# output shape is [N, C, m], adaptive pool divide L dimension
# of input data into m grids averagely and performs poolings in each
# grid to get output.
# adaptive avg pool performs calculations as follows:
#
# for i in range(m):
# lstart = floor(i * L / m)
# lend = ceil((i + 1) * L / m)
# output[:, :, i] = sum(input[:, :, lstart: lend]) / (lend - lstart)
#
import paddle
input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
# avg pool3d
pool3d = paddle.nn.functional.avg_pool3d(
input,
kernel_size = 2,
stride = 2,
padding=0)
# pool3d.shape: [1, 3, 16, 16, 16]
import paddle.nn.functional as F
import numpy as np
paddle.disable_static()
data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
pool_out = F.adaptive_avg_pool1d(data, output_size=16)
# pool_out shape: [1, 3, 16]
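# worked bin boundaries for the call above (L = 32, output_size = 16):
# lstart = floor(i * 32 / 16) = 2*i, lend = ceil((i + 1) * 32 / 16) = 2*i + 2,
# so output[:, :, i] is the mean of input[:, :, 2*i : 2*i + 2]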
"""
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d')
kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size')
if stride is None:
stride = kernel_size
else:
stride = utils.convert_to_list(stride, 3, 'pool_stride')
padding_algorithm = "EXPLICIT"
if isinstance(padding, str):
padding = padding.upper()
if padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
% str(padding))
if padding == "VALID":
padding_algorithm = "VALID"
padding = [0, 0, 0]
if ceil_mode != False:
raise ValueError(
"When Attr(pool_padding) is \"VALID\", ceil_mode must be False. "
"Received ceil_mode: True.")
elif padding == "SAME":
padding_algorithm = "SAME"
padding = [0, 0, 0]
pool_type = 'avg'
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'adaptive_pool2d')
_check_input(x, 3)
check_type(output_size, 'pool_size', (int), 'adaptive_pool1d')
if data_format not in ["NCDHW", "NDHWC"]:
raise ValueError(
"Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
"Attr(data_format): %s" % str(data_format))
padding = update_padding3d(padding, data_format)
pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
l_type = "pool2d"
x = unsqueeze(x, [2])
if in_dygraph_mode():
output = core.ops.pool3d(
x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride,
'paddings', padding, 'global_pooling', False, 'padding_algorithm',
padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
'use_mkldnn', False, 'exclusive', not count_include_pad,
'data_format', data_format)
if divisor_override is None:
return output
else:
check_instance(divisor_override, "divisor_override")
return output * (kernel_size[0] * kernel_size[1] *
kernel_size[2]) / divisor_override
pool_out = core.ops.pool2d(x, 'pooling_type', pool_type, 'ksize',
pool_size, 'adaptive', True)
return squeeze(pool_out, [2])
op_type = "pool3d"
helper = LayerHelper(op_type, **locals())
helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": pool_out}
outputs = {"Out": pool_out}
helper.append_op(
type=op_type,
type=l_type,
inputs={"X": x},
outputs=outputs,
attrs={
"pooling_type": 'avg',
"ksize": kernel_size,
"global_pooling": False,
"strides": stride,
"paddings": padding,
"padding_algorithm": padding_algorithm,
"use_cudnn": True,
"ceil_mode": ceil_mode,
"use_mkldnn": False,
"exclusive": not count_include_pad,
"data_format": data_format,
"pooling_type": pool_type,
"ksize": pool_size,
"adaptive": True,
})
if divisor_override is None:
return pool_out
else:
check_instance(divisor_override, "divisor_override")
return pool_out * (kernel_size[0] * kernel_size[1] *
kernel_size[2]) / divisor_override
return squeeze(pool_out, [2])
def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
"""
This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions
of the output tensor are determined by the parameter output_size.
See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool2d` .
For avg adaptive pool2d:
.. math::
hstart &= floor(i * H_{in} / H_{out})
hend &= ceil((i + 1) * H_{in} / H_{out})
wstart &= floor(j * W_{in} / W_{out})
wend &= ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
This API implements adaptive average pooling 2d operation.
See more details in :ref:`api_nn_pooling_AdaptiveAvgPool2d` .
Args:
x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor.
......@@ -1248,16 +927,12 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor.
Raises:
ValueError: If `data_format` is not "NCHW" or "NHWC".
Examples:
.. code-block:: python
# adaptive avg pool2d
# suppose input data in shape of [N, C, H, W], `output_size` is [m, n],
# output shape is [N, C, m, n], adaptive pool divide H and W dimensions
......@@ -1279,10 +954,10 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
input_data = np.random.rand(2, 3, 32, 32)
x = paddle.to_tensor(input_data)
# x.shape is [2, 3, 32, 32]
pool_out = paddle.nn.functional.adaptive_avg_pool2d(
out = paddle.nn.functional.adaptive_avg_pool2d(
x = x,
output_size=[3, 3])
# pool_out.shape is [2, 3, 3, 3]
# out.shape is [2, 3, 3, 3]
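# worked bin boundaries for the call above (H_in = W_in = 32, output_size = [3, 3]):
# the first output row/column averages x[:, :, 0:ceil(32/3), 0:ceil(32/3)] = x[:, :, 0:11, 0:11]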
"""
if not in_dygraph_mode():
check_variable_and_dtype(x, 'x', ['float32', 'float64'],
......@@ -1337,28 +1012,8 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
"""
This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
of the output tensor are determined by the parameter output_size.
See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool3d` .
For avg adaptive pool3d:
.. math::
dstart &= floor(i * D_{in} / D_{out})
dend &= ceil((i + 1) * D_{in} / D_{out})
hstart &= floor(j * H_{in} / H_{out})
hend &= ceil((j + 1) * H_{in} / H_{out})
wstart &= floor(k * W_{in} / W_{out})
wend &= ceil((k + 1) * W_{in} / W_{out})
Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
This API implements adaptive average pooling 3d operation.
See more details in :ref:`api_nn_pooling_AdaptiveAvgPool3d` .
Args:
x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor.
......@@ -1372,16 +1027,12 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor.
Raises:
ValueError: If `data_format` is not "NCDHW" or "NDHWC".
Examples:
.. code-block:: python
# adaptive avg pool3d
# suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n],
# output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
......@@ -1406,10 +1057,10 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
input_data = np.random.rand(2, 3, 8, 32, 32)
x = paddle.to_tensor(input_data)
# x.shape is [2, 3, 8, 32, 32]
pool_out = paddle.nn.functional.adaptive_avg_pool3d(
out = paddle.nn.functional.adaptive_avg_pool3d(
x = x,
output_size=[3, 3, 3])
# pool_out.shape is [2, 3, 3, 3, 3]
# out.shape is [2, 3, 3, 3, 3]
"""
if not in_dygraph_mode():
check_variable_and_dtype(x, 'x', ['float32', 'float64'],
......@@ -1461,3 +1112,257 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
})
return pool_out
def adaptive_max_pool1d(x, output_size, return_indices=False, name=None):
"""
This API implements adaptive max pooling 1d operation.
See more details in :ref:`api_nn_pooling_AdaptiveMaxPool1d` .
Args:
x (Tensor): The input tensor of pooling operator, which is a 3-D tensor
with shape [N, C, L]. The format of input tensor is NCL,
where N is batch size, C is the number of channels, L is the
length of the feature. The data type is float32 or float64.
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain one int.
return_indices (bool): If true, the index of max pooling point will be returned along
with outputs. It cannot be set in average pooling type. Default False.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Tensor: The output tensor of adaptive pooling result. The data type is same
as input tensor.
Raises:
ValueError: 'output_size' should be an integer or a list or tuple of length 1.
Examples:
.. code-block:: python
# max adaptive pool1d
# suppose input data in shape of [N, C, L], `output_size` is m or [m],
# output shape is [N, C, m], adaptive pool divide L dimension
# of input data into m grids averagely and performs poolings in each
# grid to get output.
# adaptive max pool performs calculations as follows:
#
# for i in range(m):
# lstart = floor(i * L / m)
# lend = ceil((i + 1) * L / m)
# output[:, :, i] = max(input[:, :, lstart: lend])
#
import paddle
import paddle.nn.functional as F
import numpy as np
paddle.disable_static()
data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
pool_out = F.adaptive_max_pool1d(data, output_size=16)
# pool_out shape: [1, 3, 16]
pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_indices=True)
# pool_out shape: [1, 3, 16] indices shape: [1, 3, 16]
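# worked bin boundaries for the call above (L = 32, output_size = 16):
# lstart = floor(i * 32 / 16) = 2*i, lend = ceil((i + 1) * 32 / 16) = 2*i + 2,
# so each output step takes the max over input[:, :, 2*i : 2*i + 2]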
"""
pool_type = 'max'
check_variable_and_dtype(x, 'x', ['float32', 'float64'],
'adaptive_max_pool1d')
_check_input(x, 3)
check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d')
check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d')
pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
l_type = 'max_pool2d_with_index'
x = unsqueeze(x, [2])
if in_dygraph_mode():
pool_out = core.ops.max_pool2d_with_index(
x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True)
return (squeeze(pool_out[0], [2]), squeeze(
pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
mask = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": pool_out, "Mask": mask}
helper.append_op(
type=l_type,
inputs={"X": x},
outputs=outputs,
attrs={
"pooling_type": pool_type,
"ksize": pool_size,
"adaptive": True,
})
return (squeeze(pool_out, [2]),
squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
def adaptive_max_pool2d(x, output_size, return_indices=False, name=None):
"""
This operation applies a 2D adaptive max pooling on input tensor.
See more details in :ref:`api_nn_pooling_AdaptiveMaxPool2d` .
Args:
x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float16, float32, float64, int32 or int64.
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two elements, (H, W). H and W can be either an int, or None which means the size will be the same as that of the input.
return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default.
Returns:
Tensor: The output tensor of adaptive max pool2d result. The data type is same as input tensor.
Examples:
.. code-block:: python
# max adaptive pool2d
# suppose input data in the shape of [N, C, H, W], `output_size` is [m, n]
# output shape is [N, C, m, n], adaptive pool divide H and W dimensions
# of input data into m*n grids averagely and performs poolings in each
# grid to get output.
# adaptive max pool performs calculations as follows:
#
# for i in range(m):
# for j in range(n):
# hstart = floor(i * H / m)
# hend = ceil((i + 1) * H / m)
# wstart = floor(j * W / n)
# wend = ceil((j + 1) * W / n)
# output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend])
#
import paddle
import numpy as np
paddle.disable_static()
input_data = np.random.rand(2, 3, 32, 32)
x = paddle.to_tensor(input_data)
# x.shape is [2, 3, 32, 32]
out = paddle.nn.functional.adaptive_max_pool2d(
x = x,
output_size=[3, 3])
# out.shape is [2, 3, 3, 3]
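# a minimal extra sketch (reusing `x` from above); in this dygraph example,
# return_indices=True additionally returns the argmax locations:
out, indices = paddle.nn.functional.adaptive_max_pool2d(
x = x,
output_size=[3, 3],
return_indices=True)
# out.shape is [2, 3, 3, 3], indices.shape is [2, 3, 3, 3]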
"""
if not in_dygraph_mode():
check_variable_and_dtype(x, 'x', ['float32', 'float64'],
'adaptive_max_pool2d')
_check_input(x, 4)
#check_type(output_size, 'pool_size', (int), 'adaptive_max_pool2d')
check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool2d')
in_h, in_w = x.shape[2:4]
if isinstance(output_size, int):
output_size = utils.convert_to_list(output_size, 2, 'output_size')
else:
if output_size[0] == None:
output_size[0] = in_h
if output_size[1] == None:
output_size[1] = in_w
if in_dygraph_mode():
pool_out = core.ops.max_pool2d_with_index(
x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True)
return pool_out if return_indices else pool_out[0]
l_type = 'max_pool2d_with_index'
helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
mask = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": pool_out, "Mask": mask}
helper.append_op(
type=l_type,
inputs={"X": x},
outputs=outputs,
attrs={
"pooling_type": 'max',
"ksize": output_size,
"adaptive": True,
})
#return (pool_out, mask) if return_indices else pool_out
return pool_out
def adaptive_max_pool3d(x, output_size, return_indices=False, name=None):
"""
This operation applies a 3D adaptive max pooling on input tensor.
See more details in :ref:`api_nn_pooling_AdaptiveMaxPool3d` .
Args:
x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64.
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either an int, or None which means the size will be the same as that of the input.
return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default.
Returns:
Tensor: The output tensor of adaptive max pool3d result. The data type is same as input tensor.
Examples:
.. code-block:: python
# adaptive max pool3d
# suppose input data in the shape of [N, C, D, H, W], `output_size` is [l, m, n]
# output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
# of input data into m*n grids averagely and performs poolings in each
# grid to get output.
# adaptive max pool performs calculations as follows:
#
# for i in range(l):
# for j in range(m):
# for k in range(n):
# dstart = floor(i * D / l)
# dend = ceil((i + 1) * D / l)
# hstart = floor(j * H / m)
# hend = ceil((j + 1) * H / m)
# wstart = floor(k * W / n)
# wend = ceil((k + 1) * W / n)
# output[:, :, i, j, k] = max(input[:, :, dstart: dend, hstart: hend, wstart: wend])
#
import paddle
import numpy as np
paddle.disable_static()
input_data = np.random.rand(2, 3, 8, 32, 32)
x = paddle.to_tensor(input_data)
# x.shape is [2, 3, 8, 32, 32]
out = paddle.nn.functional.adaptive_max_pool3d(
x = x,
output_size=[3, 3, 3])
# out.shape is [2, 3, 3, 3, 3]
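# a minimal extra sketch (reusing `x` from above); per the Args note, None entries
# in output_size keep the corresponding input size, e.g. pooling over depth only:
out = paddle.nn.functional.adaptive_max_pool3d(
x = x,
output_size=[3, None, None])
# out.shape is [2, 3, 3, 32, 32]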
"""
if not in_dygraph_mode():
check_variable_and_dtype(x, 'x', ['float32', 'float64'],
'adaptive_max_pool3d')
_check_input(x, 5)
#check_type(output_size, 'pool_size', (int), 'adaptive_max_pool3d')
check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool3d')
in_l, in_h, in_w = x.shape[2:5]
if isinstance(output_size, int):
output_size = utils.convert_to_list(output_size, 3, 'output_size')
else:
if output_size[0] == None:
output_size[0] = in_l
if output_size[1] == None:
output_size[1] = in_h
if output_size[2] == None:
output_size[2] = in_w
if in_dygraph_mode():
pool_out = core.ops.max_pool3d_with_index(
x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True)
return pool_out if return_indices else pool_out[0]
l_type = 'max_pool3d_with_index'
helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
mask = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": pool_out, "Mask": mask}
helper.append_op(
type=l_type,
inputs={"X": x},
outputs=outputs,
attrs={
"pooling_type": 'max',
"ksize": output_size,
"adaptive": True,
})
return (pool_out, mask) if return_indices else pool_out
......@@ -66,16 +66,18 @@ from .common import Dropout #DEFINE_ALIAS
from .common import Dropout2D #DEFINE_ALIAS
from .common import Dropout3D #DEFINE_ALIAS
from .common import AlphaDropout #DEFINE_ALIAS
from .pooling import AdaptiveAvgPool2d #DEFINE_ALIAS
from .pooling import AdaptiveAvgPool3d #DEFINE_ALIAS
from .pooling import AvgPool1d #DEFINE_ALIAS
from .pooling import MaxPool1d #DEFINE_ALIAS
from .pooling import AdaptiveAvgPool1d #DEFINE_ALIAS
from .pooling import AdaptiveMaxPool1d #DEFINE_ALIAS
from .pooling import AvgPool2d #DEFINE_ALIAS
from .pooling import MaxPool2d #DEFINE_ALIAS
from .pooling import AvgPool3d #DEFINE_ALIAS
from .pooling import MaxPool1d #DEFINE_ALIAS
from .pooling import MaxPool2d #DEFINE_ALIAS
from .pooling import MaxPool3d #DEFINE_ALIAS
from .pooling import AdaptiveAvgPool1d #DEFINE_ALIAS
from .pooling import AdaptiveAvgPool2d #DEFINE_ALIAS
from .pooling import AdaptiveAvgPool3d #DEFINE_ALIAS
from .pooling import AdaptiveMaxPool1d #DEFINE_ALIAS
from .pooling import AdaptiveMaxPool2d #DEFINE_ALIAS
from .pooling import AdaptiveMaxPool3d #DEFINE_ALIAS
from .conv import Conv1d #DEFINE_ALIAS
from .conv import Conv2d #DEFINE_ALIAS
from .conv import Conv3d #DEFINE_ALIAS
......
......@@ -99,7 +99,8 @@ class _ConvNd(layers.Layer):
raise ValueError("in_channels must be divisible by groups.")
if padding_mode in {'reflect', 'replicate', 'circular'}:
_paired_padding = utils.convert_to_list(padding, 2, 'padding')
_paired_padding = utils.convert_to_list(padding, dims,
'padding')
self._reversed_padding_repeated_twice = _reverse_repeat_list(
_paired_padding, 2)
......@@ -318,62 +319,80 @@ class Conv2d(_ConvNd):
output of the convolution, and the corresponding activation function is
applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \\sigma (W \\ast X + b)
Out = \sigma (W \\ast X + b)
Where:
* :math:`X`: Input value, a ``Tensor`` with NCHW format.
* :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Parameters:
in_channels(int): The number of channels in the input image.
out_channels(int): The number of channels produced by convolution.
kernel_size (int|list|tuple): The size of convolution kernel.
stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. Default: 1.
in_channels(int): The number of input channels in the input image.
out_channels(int): The number of output channels produced by the convolution.
kernel_size(int|list|tuple, optional): The size of the convolving kernel.
stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. The default value is 1.
padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms.
1. a string in ['valid', 'same'].
2. an int, which means each spatial dimension (depth, height, width) is zero padded by size of `padding` on both sides
2. an int, which means each spatial dimension (depth, height, width) is zero padded by size of `padding`
3. a list[int] or tuple[int] whose length is the number of spatial dimensions, which contains the amount of padding on each side for each spatial dimension. It has the form [pad_d1, pad_d2, ...].
4. a list[int] or tuple[int] whose length is 2 * number of spatial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spatial dimensions.
5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers corresponds to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
The default value is 0.
padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` .
dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation. Default: 1.
groups (int, optional): The groups number of the Conv2d Layer. According to grouped
dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation. The default value is 1.
groups(int, optional): The groups number of the Conv2d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: 1.
weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
connected to the second half of the input channels. The default value is 1.
padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``.
weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights
of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d.
will create ParamAttr as param_attr. If it is set to None, the parameter
is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
:math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None.
bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
data_format (str, optional): Data format that specifies the layout of input.
is not set, the bias is initialized zero. The default value is None.
data_format(str, optional): Data format that specifies the layout of input.
It can be "NCHW" or "NHWC". Default: "NCHW".
Attribute:
**weight** (Parameter): the learnable weights of filter of this layer.
**bias** (Parameter or None): the learnable bias of this layer.
Shape:
- x: :math:`(N, C_{in}, H_{in}, W_{in})`
- output: :math:`(N, C_{out}, H_{out}, W_{out})`
Where
.. math::
H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel_size[0] - 1) + 1))}{strides[0]} + 1 \\\\
W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel_size[1] - 1) + 1))}{strides[1]} + 1
H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1
W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1
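For example, with :math:`H_{in} = W_{in} = 32`, kernel_size = 3, stride = 1, padding = 1 and
dilation = 1, the formulas above give :math:`H_{out} = W_{out} = (32 + 2 - 3) / 1 + 1 = 32`,
i.e. the spatial size is preserved.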
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.nn as nn
......@@ -646,35 +665,29 @@ class ConvTranspose2d(_ConvNd):
The details of convolution transpose layer, please refer to the following explanation and references
`conv2dtranspose <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_ .
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
Where:
* :math:`X`: Input value, a ``Tensor`` with NCHW format.
* :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
Where
.. math::
H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
Parameters:
in_channels(int): The number of channels in the input image.
out_channels(int): The number of channels produced by the convolution.
kernel_size(int|list|tuple): The kernel size. If kernel_size is a tuple,
it must contain two integers, (kernel_size_H, kernel_size_W).
Otherwise, the kernel will be a square.
output_padding(int|list|tuple, optional): Additional size added to one side
of each dimension in the output shape. Default: 0.
stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. Default: 1.
padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms.
1. a string in ['valid', 'same'].
2. an int, which means each spatial dimension (depth, height, width) is zero padded by size of `padding` on both sides
......@@ -682,9 +695,8 @@ class ConvTranspose2d(_ConvNd):
4. a list[int] or tuple[int] whose length is 2 * number of spatial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spatial dimensions.
5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers corresponds to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
The default value is 0.
stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. Default: 1.
output_padding(int|list|tuple, optional): Additional size added to one side
of each dimension in the output shape. Default: 0.
dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation. Default: 1.
......@@ -694,29 +706,46 @@ class ConvTranspose2d(_ConvNd):
first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels.
Default: 1.
weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
weight_attr(ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d_transpose.
bias_attr(ParamAttr|bool, optional): The attribute for the bias of conv2d_transpose.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv2d_transpose
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
data_format (str, optional): Data format that specifies the layout of input.
data_format(str, optional): Data format that specifies the layout of input.
It can be "NCHW" or "NHWC". Default: "NCHW".
Attribute:
**weight** (Parameter): the learnable weights of filters of this layer.
**bias** (Parameter or None): the learnable bias of this layer.
Shape:
- x: :math:`(N, C_{in}, H_{in}, W_{in})`
- output: :math:`(N, C_{out}, H_{out}, W_{out})`
Where
.. math::
H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\
H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel\_size[0] - 1) + 1
W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel\_size[1] - 1) + 1
H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] )
W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
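For example, with :math:`H_{in} = 32`, kernel_size = 4, stride = 2, padding = 1 and dilation = 1,
the formulas above give :math:`H^\prime_{out} = (32 - 1) * 2 - 2 * 1 + 1 * (4 - 1) + 1 = 64`;
with the default output_padding of 0 this yields :math:`H_{out} = 64`.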
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.nn as nn
......@@ -791,66 +820,86 @@ class Conv3d(_ConvNd):
provided, bias is added to the output of the convolution, and the
corresponding activation function is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
In the above equation:
* :math:`X`: Input value, a tensor with NCDHW or NDHWC format.
* :math:`W`: Filter value, a tensor with MCDHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Parameters:
in_channels(int): The number of input channels in the input image.
out_channels(int): The number of output channels produced by the convolution.
kernel_size (int|list|tuple, optional): The size of the convolving kernel.
stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must
kernel_size(int|list|tuple, optional): The size of the convolving kernel.
stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
stride_D = stride_H = stride_W = stride. The default value is 1.
padding (int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms.
padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms.
1. a string in ['valid', 'same'].
2. an int, which means each spatial dimension (depth, height, width) is zero padded by size of `padding`
3. a list[int] or tuple[int] whose length is the number of spatial dimensions, which contains the amount of padding on each side for each spatial dimension. It has the form [pad_d1, pad_d2, ...].
4. a list[int] or tuple[int] whose length is 2 * number of spatial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spatial dimensions.
5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers corresponds to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
The default value is 0.
dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
groups (int, optional): The groups number of the Conv3d Layer. According to grouped
groups(int, optional): The groups number of the Conv3d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. The default value is 1.
padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``.
weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``.
weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights
of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
will create ParamAttr as param_attr. If it is set to None, the parameter
is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
:math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None.
bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d.
bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv3d.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv3d
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. The default value is None.
data_format (str, optional): Data format that specifies the layout of input.
data_format(str, optional): Data format that specifies the layout of input.
It can be "NCDHW" or "NDHWC". Default: "NCDHW".
Attribute:
**weight** (Parameter): the learnable weights of filters of this layer.
**bias** (Parameter): the learnable bias of this layer.
Shape:
- x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
- output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
Where
.. math::
D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1
H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1
W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (kernel\_size[2] - 1) + 1))}{strides[2]} + 1
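As a quick reading aid, the output-size rule above can be evaluated directly; a minimal illustrative sketch with arbitrary example values:
.. code-block:: python
# Illustrative helper mirroring the Conv3d output-size formula above;
# floor division reproduces the implicit floor in the equation.
def conv_out_size(in_size, padding, dilation, kernel_size, stride):
    return (in_size + 2 * padding - (dilation * (kernel_size - 1) + 1)) // stride + 1
# e.g. D_in = 8, paddings[0] = 1, dilations[0] = 1, kernel_size[0] = 3, strides[0] = 2
print(conv_out_size(8, 1, 1, 3, 2))  # 4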
Raises:
ValueError: If the shapes of input, filter_size, stride, padding and
groups mismatch.
Examples:
.. code-block:: python
import numpy as np
import paddle
......@@ -936,17 +985,22 @@ class ConvTranspose3d(_ConvNd):
the output of the convolution, and the corresponding activation function
is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
In the above equation:
* :math:`X`: Input value, a tensor with NCDHW format.
* :math:`W`: Filter value, a tensor with MCDHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
**Note**:
The conv_transpose3d can be seen as the backward of the conv3d. For conv3d,
when stride > 1, conv3d maps multiple input shape to the same output shape,
so for conv_transpose3d, when stride > 1, input shape maps multiple output shape.
......@@ -957,6 +1011,7 @@ class ConvTranspose3d(_ConvNd):
and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must
between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`,
conv_transpose3d can compute the kernel size automatically.
Parameters:
in_channels(int): The number of channels in the input image.
out_channels(int): The number of channels produced by the convolution.
......@@ -985,11 +1040,11 @@ class ConvTranspose3d(_ConvNd):
first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels.
The default value is 1.
weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights
of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. The default value is None.
bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose.
bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv3d_transpose
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
......@@ -999,24 +1054,38 @@ class ConvTranspose3d(_ConvNd):
filter_size, padding, and stride to calculate output_size.
if output_size and filter_size are specified at the same time, They
should follow the formula above. Default: None.
data_format (str, optional): Data format that specifies the layout of input.
data_format(str, optional): Data format that specifies the layout of input.
It can be "NCDHW" or "NDHWC". Default: "NCDHW".
Attribute:
**weight** (Parameter): the learnable weights of filters of this layer.
**bias** (Parameter): the learnable bias of this layer.
Shape:
- x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
- output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
Where
.. math::
D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\
H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel_size[2] - 1) + 1 \\\\
D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel\_size[0] - 1) + 1
H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel\_size[1] - 1) + 1
W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel\_size[2] - 1) + 1
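A matching illustrative sketch for the transposed-convolution rule above (arbitrary example values); it gives the smallest admissible output size before any explicit output_size is applied:
.. code-block:: python
# Illustrative helper mirroring the conv-transpose output-size formula above.
def conv_transpose_out_size(in_size, padding, dilation, kernel_size, stride):
    return (in_size - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + 1
# e.g. D_in = 4, paddings[0] = 1, dilations[0] = 1, kernel_size[0] = 3, strides[0] = 2
print(conv_transpose_out_size(4, 1, 1, 3, 2))  # 7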
Raises:
ValueError: If the shapes of input, filter_size, stride, padding and
groups mismatch.
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.nn as nn
......@@ -1024,7 +1093,7 @@ class ConvTranspose3d(_ConvNd):
paddle.disable_static()
x_var = paddle.to_tensor(x)
conv = nn.Conv3DTranspose(4, 6, (3, 3, 3))
conv = nn.ConvTranspose3d(4, 6, (3, 3, 3))
y_var = conv(x_var)
y_np = y_var.numpy()
print(y_np.shape)
......
......@@ -634,9 +634,12 @@ class KLDivLoss(fluid.dygraph.Layer):
Default is ``'mean'``.
Shape:
- input: (N, *) where * means, any number of additional dimensions.
- label: (N, *), same shape as input
- output: tensor with shape: (1) by default.
- input (Tensor): (N, *), where * means any number of additional dimensions.
- label (Tensor): (N, *), same shape as input.
- output (Tensor): tensor with shape: [1] by default.
Examples:
......@@ -646,7 +649,7 @@ class KLDivLoss(fluid.dygraph.Layer):
import numpy as np
import paddle.nn as nn
paddle.enable_imperative()
paddle.disable_static()
shape = (5, 20)
x = np.random.uniform(-10, 10, shape).astype('float32')
......@@ -654,26 +657,26 @@ class KLDivLoss(fluid.dygraph.Layer):
# 'batchmean' reduction, loss shape will be [N]
kldiv_criterion = nn.KLDivLoss(reduction='batchmean')
pred_loss = kldiv_criterion(paddle.to_variable(x),
paddle.to_variable(target))
pred_loss = kldiv_criterion(paddle.to_tensor(x),
paddle.to_tensor(target))
# shape=[5]
# 'mean' reduction, loss shape will be [1]
kldiv_criterion = nn.KLDivLoss(reduction='mean')
pred_loss = kldiv_criterion(paddle.to_variable(x),
paddle.to_variable(target))
pred_loss = kldiv_criterion(paddle.to_tensor(x),
paddle.to_tensor(target))
# shape=[1]
# 'sum' reduction, loss shape will be [1]
kldiv_criterion = nn.KLDivLoss(reduction='sum')
pred_loss = kldiv_criterion(paddle.to_variable(x),
paddle.to_variable(target))
pred_loss = kldiv_criterion(paddle.to_tensor(x),
paddle.to_tensor(target))
# shape=[1]
# 'none' reduction, loss shape is same with X shape
kldiv_criterion = nn.KLDivLoss(reduction='none')
pred_loss = kldiv_criterion(paddle.to_variable(x),
paddle.to_variable(target))
pred_loss = kldiv_criterion(paddle.to_tensor(x),
paddle.to_tensor(target))
# shape=[5, 20]
"""
......
......@@ -27,6 +27,7 @@
# TODO: define normalization api
import six
from ...fluid.dygraph.nn import InstanceNorm
from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS
......@@ -36,7 +37,6 @@ from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS
from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS
from ...fluid.dygraph import layers
from ...framework import get_default_dtype, set_default_dtype
from ...fluid.framework import in_dygraph_mode
......@@ -50,6 +50,7 @@ from ..functional import batch_norm, layer_norm, instance_norm
import numpy as np
import numbers
import warnings
from ...fluid.dygraph.base import no_grad
__all__ = [
'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm',
......@@ -566,17 +567,28 @@ class _BatchNormBase(layers.Layer):
param_shape = [num_features]
# create parameter
if weight_attr == False:
self.weight = self.create_parameter(
attr=None, shape=param_shape, default_initializer=Constant(1.0))
self.weight.stop_gradient = True
else:
self.weight = self.create_parameter(
attr=self._weight_attr,
shape=param_shape,
default_initializer=Constant(1.0))
self.weight.stop_gradient = (self._weight_attr is False) or (
self._weight_attr and self._weight_attr.learning_rate == 0.)
self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
if bias_attr == False:
self.bias = self.create_parameter(
attr=None,
shape=param_shape,
default_initializer=Constant(0.0),
is_bias=True)
self.bias.stop_gradient = True
else:
self.bias = self.create_parameter(
attr=self._bias_attr, shape=param_shape, is_bias=True)
self.bias.stop_gradient = (self._bias_attr is False) or (
self._bias_attr and self._bias_attr.learning_rate == 0.)
self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0.
moving_mean_name = None
moving_variance_name = None
......@@ -611,6 +623,7 @@ class _BatchNormBase(layers.Layer):
self._epsilon = epsilon
self._fuse_with_relu = False
self._track_running_stats = track_running_stats
self._name = name
def _check_input_dim(self, input):
raise NotImplementedError("BatchNorm Base error")
......@@ -898,7 +911,7 @@ class BatchNorm3d(_BatchNormBase):
len(input.shape)))
class SyncBatchNorm(layers.Layer):
class SyncBatchNorm(_BatchNormBase):
"""
This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can
......@@ -984,72 +997,16 @@ class SyncBatchNorm(layers.Layer):
def __init__(self,
num_features,
epsilon=1e-05,
momentum=0.9,
track_running_stats=True,
epsilon=1e-05,
weight_attr=None,
bias_attr=None,
data_format='NCHW',
track_running_stats=True,
name=None):
super(SyncBatchNorm, self).__init__()
self._weight_attr = weight_attr
self._bias_attr = bias_attr
self._num_features = num_features
self._data_layout = data_format
self._momentum = momentum
self._epsilon = epsilon
self._track_running_stats = track_running_stats
if self._track_running_stats == False:
warnings.warn(
"moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version."
)
param_shape = [self._num_features]
# create parameter
if weight_attr == False:
self.weight = self.create_parameter(
attr=None, shape=param_shape, default_initializer=Constant(1.0))
self.weight.stop_gradient = True
else:
self.weight = self.create_parameter(
attr=self._weight_attr,
shape=param_shape,
default_initializer=Constant(1.0))
self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
if bias_attr == False:
self.bias = self.create_parameter(
attr=None,
shape=param_shape,
default_initializer=Constant(0.0),
is_bias=True)
self.bias.stop_gradient = True
else:
self.bias = self.create_parameter(
attr=self._bias_attr, shape=param_shape, is_bias=True)
self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
self._mean = self.create_parameter(
attr=ParamAttr(
name=None,
initializer=Constant(0.0),
trainable=False,
do_model_average=True),
shape=param_shape,
dtype=self._dtype)
self._mean.stop_gradient = True
self._variance = self.create_parameter(
attr=ParamAttr(
name=None,
initializer=Constant(1.0),
trainable=False,
do_model_average=True),
shape=param_shape,
dtype=self._dtype)
self._variance.stop_gradient = True
super(SyncBatchNorm,
self).__init__(num_features, momentum, epsilon, weight_attr,
bias_attr, data_format, track_running_stats, name)
def forward(self, x):
# create output
......@@ -1063,7 +1020,7 @@ class SyncBatchNorm(layers.Layer):
if in_dygraph_mode():
attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
"is_test", not self.training, "data_layout",
self._data_layout, "use_mkldnn", False, "fuse_with_relu",
self._data_format, "use_mkldnn", False, "fuse_with_relu",
False, "use_global_stats", False, 'trainable_statistics',
False)
sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm(
......@@ -1073,13 +1030,13 @@ class SyncBatchNorm(layers.Layer):
return sync_batch_norm_out
check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
'BatchNorm')
'SyncBatchNorm')
attrs = {
"momentum": self._momentum,
"epsilon": self._epsilon,
"is_test": not self.training,
"data_layout": self._data_layout,
"data_layout": self._data_format,
"use_mkldnn": False,
"fuse_with_relu": False,
"use_global_stats": False,
......@@ -1112,3 +1069,45 @@ class SyncBatchNorm(layers.Layer):
self._helper.append_op(
type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
return sync_batch_norm_out
@classmethod
def convert_sync_batchnorm(cls, layer):
"""
Helper function to convert :class: `paddle.nn.BatchNorm*d` layers in the model to :class: `paddle.nn.SyncBatchNorm` layers.
Parameters:
layer(paddle.nn.Layer): model containing one or more `BatchNorm*d` layers.
Returns:
The original model with its `BatchNorm*d` layers replaced by `SyncBatchNorm` layers.
Examples:
.. code-block:: python
import paddle
import paddle.nn as nn
paddle.disable_static()
model = nn.Sequential(nn.Conv2d(3, 5, 3), nn.BatchNorm2d(5))
sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
"""
layer_output = layer
if isinstance(layer, _BatchNormBase):
layer_output = SyncBatchNorm(layer._num_features, layer._epsilon,
layer._momentum, layer._weight_attr,
layer._bias_attr, layer._data_format,
layer._name)
if layer._weight_attr != False and layer._bias_attr != False:
with no_grad():
layer_output.weight = layer.weight
layer_output.bias = layer.bias
layer_output._mean = layer._mean
layer_output._variance = layer._variance
for name, sublayer in layer.named_sublayers():
layer_output.add_sublayer(name,
cls.convert_sync_batchnorm(sublayer))
del layer
return layer_output
......@@ -12,198 +12,26 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from ...fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
from ...fluid.layers import utils
from ...fluid.dygraph import layers
from ...fluid.layer_helper import LayerHelper
from .. import functional as F
__all__ = [
'AdaptiveAvgPool2d',
'AdaptiveAvgPool3d',
'AvgPool1d',
'maxPool1d',
'AdaptiveMaxPool1d',
'AdaptiveAvgPool1d',
'AvgPool2d',
'MaxPool2d',
'AvgPool3d',
'MaxPool1d',
'MaxPool2d',
'MaxPool3d',
'AdaptiveAvgPool1d',
'AdaptiveAvgPool2d',
'AdaptiveAvgPool3d',
'AdaptiveMaxPool1d',
'AdaptiveMaxPool2d',
'AdaptiveMaxPool3d',
]
class AdaptiveAvgPool2d(layers.Layer):
"""
This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions
of the output tensor are determined by the parameter output_size.
For avg adaptive pool2d:
.. math::
hstart &= floor(i * H_{in} / H_{out})
hend &= ceil((i + 1) * H_{in} / H_{out})
wstart &= floor(j * W_{in} / W_{out})
wend &= ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
Parameters:
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two element, (H, W). H and W can be either a int, or None which means
the size will be the same as that of the input.
data_format (str): The data format of the input and output data. An optional string
from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
the order of: [batch_size, input_channels, input_height, input_width].
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Shape:
x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type can be float32 or float64.
output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type is same as input x.
Returns:
A callable object of AdaptiveAvgPool2d.
Examples:
.. code-block:: python
# adaptive avg pool2d
# suppose input data in shape of [N, C, H, W], `output_size` is [m, n],
# output shape is [N, C, m, n], adaptive pool divide H and W dimensions
# of input data into m * n grids averagely and performs poolings in each
# grid to get output.
# adaptive avg pool performs calculations as follow:
#
# for i in range(m):
# for j in range(n):
# hstart = floor(i * H / m)
# hend = ceil((i + 1) * H / m)
# wstart = floor(i * W / n)
# wend = ceil((i + 1) * W / n)
# output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
#
import paddle
import numpy as np
paddle.disable_static()
input_data = np.random.rand(2, 3, 32, 32)
x = paddle.to_tensor(input_data)
# x.shape is [2, 3, 32, 32]
adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=3)
pool_out = adaptive_avg_pool(x = x)
# pool_out.shape is [2, 3, 3, 3]
"""
def __init__(self, output_size, data_format="NCHW", name=None):
super(AdaptiveAvgPool2d, self).__init__()
self._output_size = output_size
self._data_format = data_format
self._name = name
def forward(self, x):
return F.adaptive_avg_pool2d(
x,
output_size=self._output_size,
data_format=self._data_format,
name=self._name)
class AdaptiveAvgPool3d(layers.Layer):
"""
This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
of the output tensor are determined by the parameter output_size.
For avg adaptive pool3d:
.. math::
dstart &= floor(i * D_{in} / D_{out})
dend &= ceil((i + 1) * D_{in} / D_{out})
hstart &= floor(j * H_{in} / H_{out})
hend &= ceil((j + 1) * H_{in} / H_{out})
wstart &= floor(k * W_{in} / W_{out})
wend &= ceil((k + 1) * W_{in} / W_{out})
Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
Parameters:
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means
the size will be the same as that of the input.
data_format (str): The data format of the input and output data. An optional string
from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in
the order of: [batch_size, input_channels, input_depth, input_height, input_width].
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Shape:
x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type can be float32 or float64.
output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x.
Returns:
A callable object of AdaptiveAvgPool3d.
Examples:
.. code-block:: python
# adaptive avg pool3d
# suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n],
# output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
# of input data into l * m * n grids averagely and performs poolings in each
# grid to get output.
# adaptive avg pool performs calculations as follow:
#
# for i in range(l):
# for j in range(m):
# for k in range(n):
# dstart = floor(i * D / l)
# dend = ceil((i + 1) * D / l)
# hstart = floor(j * H / m)
# hend = ceil((j + 1) * H / m)
# wstart = floor(k * W / n)
# wend = ceil((k + 1) * W / n)
# output[:, :, i, j, k] =
# avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
import paddle
import numpy as np
paddle.disable_static()
input_data = np.random.rand(2, 3, 8, 32, 32)
x = paddle.to_tensor(input_data)
# x.shape is [2, 3, 8, 32, 32]
adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=3)
pool_out = adaptive_avg_pool(x = x)
# pool_out = [2, 3, 3, 3, 3]
"""
def __init__(self, output_size, data_format="NCDHW", name=None):
super(AdaptiveAvgPool3d, self).__init__()
self._output_size = output_size
self._data_format = data_format
self._name = name
def forward(self, x):
return F.adaptive_avg_pool3d(
x,
output_size=self._output_size,
data_format=self._data_format,
name=self._name)
class AvgPool1d(layers.Layer):
"""
This operation applies a 1D average pooling over an input signal composed
......@@ -223,17 +51,20 @@ class AvgPool1d(layers.Layer):
Args:
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain one integers.
it must contain an integer.
stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain one integers.
padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero,
then the input is implicitly zero-padded on both sides for padding number of points.
it must contain an integer.
padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
1. A string in ['valid', 'same'].
2. An int, which means the feature map is zero padded by size of `padding` on every side.
3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every side.
4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
The default value is 0.
count_include_pad (bool): Whether to exclude padding points in average pooling
mode, default is `true`.
mode, default is `True`.
ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width.
If it is set to False, the floor function will be used. Default False
If it is set to False, the floor function will be used. The default value is False.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
......@@ -245,10 +76,14 @@ class AvgPool1d(layers.Layer):
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is "VALID", but `ceil_mode` is True.
ValueError: If `padding` is a list or tuple but its length greater than 1.
ShapeError: If the input is not a 3-D.
ShapeError: If the input is not a 3-D tensor.
ShapeError: If the output's shape calculated is not greater than 0.
Shape:
- input: 3-D tensor.
- output: 3-D tensor.
Examples:
.. code-block:: python
......@@ -284,63 +119,74 @@ class AvgPool1d(layers.Layer):
return out
class MaxPool1d(layers.Layer):
class AvgPool2d(layers.Layer):
"""
Applies a 1D max pooling over an input signal composed of several input planes based
on the input, output_size, return_indices parameters.
Input(X) and output(Out) are in NCL format, where N is batch
size, C is the number of channels, L is the length of the feature.
The output value of the layer with input size (N, C, L),
output (N, C, L_{out}) and kernel_size k can be precisely described as
For average pool1d:
This operation applies 2D average pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCHW format, where N is batch size, C is the number of channels,
H is the height of the feature, and W is the width of the feature.
.. math::
Example:
Input:
X shape: $(N, C, H_{in}, W_{in})$
Attr:
kernel_size: ksize
Output(N_i, C_i, l) &= max(Input[N_i, C_i, stride \times l:stride \times l+k])
Output:
Out shape: $(N, C, H_{out}, W_{out})$
$$
out(N_i, C_j, h, w) = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1}
input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
$$
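A minimal numpy sketch (illustrative only, not the Paddle kernel) of the window average above for a single channel, assuming stride equal to kernel_size and no padding:
.. code-block:: python
import numpy as np
x = np.arange(16, dtype=np.float32).reshape(4, 4)   # one channel of one sample
kh = kw = 2
out = np.array([[x[h * kh:h * kh + kh, w * kw:w * kw + kw].mean()
                 for w in range(x.shape[1] // kw)]
                for h in range(x.shape[0] // kh)])
print(out)   # [[ 2.5  4.5]
             #  [10.5 12.5]]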
Args:
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain one integers.
it must contain two integers, (pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be a square of an int.
stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain one integers.
padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
it could be the following forms: `[pad_left, pad_right]`.
return_indices (bool): Whether return the max indices along with the outputs. default is `False`.
ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default.
If it is set to False, the floor function will be used. Default False
it must contain two integers, (pool_stride_Height, pool_stride_Width).
Otherwise, the pool stride size will be a square of an int.
padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
1. A string in ['valid', 'same'].
2. An int, which means the feature map is zero padded by size of `padding` on every side.
3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_width] whose value means the padding size of each dimension.
4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
The default value is 0.
ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
count_include_pad (bool): Whether to exclude padding points in average pooling
mode, default is `True`.
divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
None.
Shape:
- x: 4-D tensor.
- out: 4-D tensor.
Returns: None.
Raises:
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is "VALID", but `ceil_mode` is True.
ValueError: If `padding` is a list or tuple but its length greater than 1.
ShapeError: If the input is not a 3-D.
ShapeError: If the output's shape calculated is not greater than 0.
Examples:
.. code-block:: python
import paddle
import paddle.nn as nn
import numpy as np
paddle.disable_static()
data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
pool_out = MaxPool1d(data)
# pool_out shape: [1, 3, 16]
MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0, return_indices=True)
pool_out, indices = MaxPool1d(data)
# pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
# avg pool2d
input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
AvgPool2d = nn.AvgPool2d(kernel_size=2,
stride=2, padding=0)
output = AvgPool2d(input)
# output.shape [1, 3, 16, 16]
"""
......@@ -348,113 +194,155 @@ class MaxPool1d(layers.Layer):
kernel_size,
stride=None,
padding=0,
return_indices=False,
ceil_mode=False,
count_include_pad=True,
divisor_override=None,
data_format="NCHW",
name=None):
super(MaxPool1d, self).__init__()
self.kernel_size = kernel_size
super(AvgPool2d, self).__init__()
self.ksize = kernel_size
self.stride = stride
self.padding = padding
self.ceil_mode = ceil_mode
self.return_indices = return_indices
self.name = name
self.count_include_pad = count_include_pad
self.divisor = divisor_override
self.data_format = data_format
self.name = name
def forward(self, input):
out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding,
self.return_indices, self.ceil_mode, self.name)
return out
def forward(self, x):
return F.avg_pool2d(
x,
kernel_size=self.ksize,
stride=self.stride,
padding=self.padding,
ceil_mode=self.ceil_mode,
count_include_pad=self.count_include_pad,
divisor_override=self.divisor,
data_format=self.data_format,
name=self.name)
class AdaptiveAvgPool1d(layers.Layer):
class AvgPool3d(layers.Layer):
"""
This operation applies a 1D adaptive average pooling over an input signal composed
of several input planes, based on the input, output_size, return_indices parameters.
Input(X) and output(Out) are in NCL format, where N is batch
size, C is the number of channels, L is the length of the feature.
The output tensor shape will be [N, C, output_size].
For average adaptive pool1d:
.. math::
lstart &= floor(i * L_{in} / L_{out})
lend &= ceil((i + 1) * L_{in} / L_{out})
Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)}
This operation applies 3D average pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCDHW format, where N is batch size, C is the number of channels,
H is the height of the feature, D is the depth of the feature, and W is the width of the feature.
Args:
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain one int.
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
is a tuple or list, it must contain three integers,
(kernel_size_Depth, kernel_size_Height, kernel_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain three integers, (stride_Depth, stride_Height, stride_Width).
Otherwise, the pool stride size will be a cube of an int.
padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
1. A string in ['valid', 'same'].
2. An int, which means the feature map is zero padded by size of `padding` on every side.
3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_width] whose value means the padding size of each dimension.
4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
The default value is 0.
ceil_mode (bool): ${ceil_mode_comment}
count_include_pad (bool): Whether to exclude padding points in average pooling
mode, default is True.
divisor_override (int|float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_depth, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
None.
Returns: None.
Raises:
ValueError: 'pool_size' should be an integer or list or tuple with length as 1.
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is "VALID", but `ceil_mode` is True.
ShapeError: If the output's shape calculated is not greater than 0.
Shape:
- x: 5-D tensor.
- out: 5-D tensor.
Examples:
.. code-block:: python
# average adaptive pool1d
# suppose input data in shape of [N, C, L], `output_size` is m or [m],
# output shape is [N, C, m], adaptive pool divide L dimension
# of input data into m grids averagely and performs poolings in each
# grid to get output.
# adaptive max pool performs calculations as follow:
#
# for i in range(m):
# lstart = floor(i * L / m)
# lend = ceil((i + 1) * L / m)
# output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend)
#
import paddle
import paddle.nn as nn
import numpy as np
paddle.disable_static()
data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
AdaptiveAvgPool1d = nn.AdaptiveAvgPool1d(output_size=16)
pool_out = AdaptiveAvgPool1d(data)
# pool_out shape: [1, 3, 16]
# avg pool3d
input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32))
AvgPool3d = nn.AvgPool3d(kernel_size=2,
stride=2, padding=0)
output = AvgPool3d(input)
# output.shape [1, 2, 3, 16, 16]
"""
def __init__(self, output_size, name=None):
super(AdaptiveAvgPool1d, self).__init__()
self.output_size = output_size
def __init__(self,
kernel_size,
stride,
padding=0,
ceil_mode=False,
count_include_pad=True,
divisor_override=None,
data_format="NCDHW",
name=None):
super(AvgPool3d, self).__init__()
self.ksize = kernel_size
self.stride = stride
self.padding = padding
self.ceil_mode = ceil_mode
self.count_include_pad = count_include_pad
self.divisor = divisor_override
self.data_format = data_format
self.name = name
def forward(self, input):
return F.adaptive_avg_pool1d(input, self.output_size, self.name)
def forward(self, x):
return F.avg_pool3d(
x,
kernel_size=self.ksize,
stride=self.stride,
padding=self.padding,
ceil_mode=self.ceil_mode,
count_include_pad=self.count_include_pad,
divisor_override=self.divisor,
data_format=self.data_format,
name=self.name)
class AdaptiveMaxPool1d(layers.Layer):
class MaxPool1d(layers.Layer):
"""
This operation applies a 1D adaptive max pooling over an input signal composed
of several input planes, based on the input, output_size, return_indices parameters.
Applies a 1D max pooling over an input signal composed of several input planes based
on the input and the kernel_size, stride, padding, return_indices parameters.
Input(X) and output(Out) are in NCL format, where N is batch
size, C is the number of channels, L is the length of the feature.
The output tensor shape will be [N, C, output_size].
For max adaptive pool1d:
The output value of the layer with input size (N, C, L),
output (N, C, L_{out}) and kernel_size k can be precisely described as
For max pool1d:
.. math::
lstart &= floor(i * L_{in} / L_{out})
lend &= ceil((i + 1) * L_{in} / L_{out})
Output(i) &= max(Input[lstart:lend])
Output(N_i, C_i, l) &= max(Input[N_i, C_i, stride \times l:stride \times l+k])
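An illustrative numpy sketch of the rule above for one channel, with arbitrary example values and 'valid' padding:
.. code-block:: python
import numpy as np
x = np.arange(8, dtype=np.float32)   # one channel of one sample
k, stride = 2, 2
L_out = (len(x) - k) // stride + 1
out = np.array([x[stride * l:stride * l + k].max() for l in range(L_out)])
print(out)   # [1. 3. 5. 7.]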
Args:
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain one int.
return_indices (bool): If true, the index of max pooling point will be returned along
with outputs. It cannot be set in average pooling type. Default False.
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain an integer.
stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain an integer.
padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
1. A string in ['valid', 'same'].
2. An integer, which means the feature map is zero padded by size of `padding` on every side.
3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every side.
4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
The default value is 0.
return_indices (bool): Whether to return the max indices along with the outputs. Default is `False`.
ceil_mode (bool): Whether to use the ceil function to calculate output height and width.
If it is set to False, the floor function will be used. Default is `False`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
......@@ -462,53 +350,60 @@ class AdaptiveMaxPool1d(layers.Layer):
None.
Raises:
ValueError: 'pool_size' should be an integer or list or tuple with length as 1.
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is "VALID", but `ceil_mode` is True.
ValueError: If `padding` is a list or tuple but its length greater than 1.
ShapeError: If the input is not a 3-D.
ShapeError: If the output's shape calculated is not greater than 0.
Shape:
- x: 3-D tensor.
- out: 3-D tensor.
Examples:
.. code-block:: python
# max adaptive pool1d
# suppose input data in shape of [N, C, L], `output_size` is m or [m],
# output shape is [N, C, m], adaptive pool divide L dimension
# of input data into m grids averagely and performs poolings in each
# grid to get output.
# adaptive max pool performs calculations as follow:
#
# for i in range(m):
# lstart = floor(i * L / m)
# lend = ceil((i + 1) * L / m)
# output[:, :, i] = max(input[:, :, lstart: lend])
#
import paddle
import paddle.nn as nn
import numpy as np
paddle.disable_static()
data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16)
pool_out = AdaptiveMaxPool1d(data)
MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
pool_out = MaxPool1d(data)
# pool_out shape: [1, 3, 16]
# for return_indices = true
AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16, return_indices=True)
pool_out, indices = AdaptiveMaxPool1d(data)
MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0, return_indices=True)
pool_out, indices = MaxPool1d(data)
# pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
"""
def __init__(self, output_size, return_indices=False, name=None):
super(AdaptiveMaxPool1d, self).__init__()
self.output_size = output_size
def __init__(self,
kernel_size,
stride=None,
padding=0,
return_indices=False,
ceil_mode=False,
name=None):
super(MaxPool1d, self).__init__()
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.ceil_mode = ceil_mode
self.return_indices = return_indices
self.name = name
def forward(self, input):
return F.adaptive_max_pool1d(input, self.output_size,
self.return_indices, self.name)
out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding,
self.return_indices, self.ceil_mode, self.name)
return out
class AvgPool2d(layers.Layer):
class MaxPool2d(layers.Layer):
"""
This operation applies 2D average pooling over input features based on the input,
This operation applies 2D max pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCHW format, where N is batch size, C is the number of channels,
H is the height of the feature, and W is the width of the feature.
......@@ -522,8 +417,9 @@ class AvgPool2d(layers.Layer):
Output:
Out shape: $(N, C, H_{out}, W_{out})$
$$
out(N_i, C_j, h, w) = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1}
input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\
& \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
\text{stride[1]} \times w + n)
$$
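Analogously, an illustrative numpy sketch (not the Paddle kernel) of the max-pool window rule above for one channel, with stride equal to kernel_size and no padding:
.. code-block:: python
import numpy as np
x = np.arange(16, dtype=np.float32).reshape(4, 4)
kh = kw = 2
out = np.array([[x[h * kh:h * kh + kh, w * kw:w * kw + kw].max()
                 for w in range(x.shape[1] // kw)]
                for h in range(x.shape[0] // kh)])
print(out)   # [[ 5.  7.]
             #  [13. 15.]]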
Args:
......@@ -532,31 +428,33 @@ class AvgPool2d(layers.Layer):
Otherwise, the pool kernel size will be a square of an int.
stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain two integers, (pool_stride_Height, pool_stride_Width).
Otherwise, the pool stride size will be a square of an int. Default: kernel_size.
padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
it could be in three forms: `[pad_height, pad_width]` or
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
`pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Otherwise, the pool padding size will be a square of an int.
Otherwise, the pool stride size will be a square of an int.
padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
1. A string in ['valid', 'same'].
2. An int, which means the feature map is zero padded by size of `padding` on every side.
3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_width] whose value means the padding size of each dimension.
4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
The default value is 0.
ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
count_include_pad (bool): Whether to exclude padding points in average pooling
mode, default is `true`.
divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
return_indices (bool): Whether to return the max indices along with the outputs.
data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns: None.
Returns: None
Raises:
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is "VALID", but `ceil_mode` is True.
ShapeError: If the output's shape calculated is not greater than 0.
Shape:
- x: 4-D tensor.
- out: 4-D tensor.
Examples:
.. code-block:: python
import paddle
......@@ -566,95 +464,87 @@ class AvgPool2d(layers.Layer):
# max pool2d
input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
AvgPool2d = nn.AvgPool2d(kernel_size=2,
MaxPool2d = nn.MaxPool2d(kernel_size=2,
stride=2, padding=0)
output = AvgPoo2d(input)
output = MaxPool2d(input)
# output.shape [1, 3, 16, 16]
# for return_indices=True
MaxPool2d = nn.MaxPool2d(kernel_size=2,stride=2, padding=0, return_indices=True)
output, max_indices = MaxPool2d(input)
# output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
"""
def __init__(self,
kernel_size,
stride=None,
padding=0,
return_indices=False,
ceil_mode=False,
count_include_pad=True,
divisor_override=None,
data_format="NCHW",
name=None):
super(AvgPool2d, self).__init__()
super(MaxPool2d, self).__init__()
self.ksize = kernel_size
self.stride = stride
self.padding = padding
self.return_indices = return_indices
self.ceil_mode = ceil_mode
self.count_include_pad = count_include_pad
self.divisor = divisor_override
self.data_format = data_format
self.name = name
def forward(self, x):
return F.avg_pool2d(
return F.max_pool2d(
x,
kernel_size=self.ksize,
stride=self.stride,
padding=self.padding,
ceil_mode=self.ceil_mode,
count_include_pad=self.count_include_pad,
divisor_override=self.divisor,
return_indices=self.return_indices,
data_format=self.data_format,
name=self.name)
class MaxPool2d(layers.Layer):
class MaxPool3d(layers.Layer):
"""
This operation applies 2D max pooling over input feature based on the input,
This operation applies 3D max pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCHW format, where N is batch size, C is the number of channels,
H is the height of the feature, and W is the width of the feature.
Example:
Input:
X shape: $(N, C, H_{in}, W_{in})$
Attr:
kernel_size: ksize
Output:
Out shape: $(N, C, H_{out}, W_{out})$
$$
out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\
& \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
\text{stride[1]} \times w + n)
$$
in NCDHW format, where N is batch size, C is the number of channels,
H is the height of the feature, D is the depth of the feature, and W is the width of the feature.
Args:
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be a square of an int.
kernel_size (int|list|tuple): The pool kernel size. If the kernel size
is a tuple or list, it must contain three integers,
(kernel_size_Depth, kernel_size_Height, kernel_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain two integers, (pool_stride_Height, pool_stride_Width).
Otherwise, the pool stride size will be a square of an int. Default: kernel_size.
padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
it could be in three forms: `[pad_height, pad_width]` or
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
`pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Otherwise, the pool padding size will be a square of an int.
ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
it must contain three integers, (stride_Depth, stride_Height, stride_Width).
Otherwise, the pool stride size will be a cube of an int.
padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
1. A string in ['valid', 'same'].
2. An int, which means the feature map is zero padded by size of `padding` on every side.
3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_width] whose value means the padding size of each dimension.
4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
The default value is 0.
ceil_mode (bool): ${ceil_mode_comment}
return_indices (bool): Whether to return the max indices along with the outputs.
data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_depth, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns: None
Returns: None.
Raises:
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is "VALID", but `ceil_mode` is True.
ShapeError: If the output's shape calculated is not greater than 0.
Shape:
- x: 5-D tensor.
- out: 5-D tensor.
Examples:
.. code-block:: python
import paddle
......@@ -662,28 +552,28 @@ class MaxPool2d(layers.Layer):
import numpy as np
paddle.disable_static()
# max pool2d
input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
MaxPool2d = nn.MaxPool2d(kernel_size=2,
# max pool3d
input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32))
MaxPool3d = nn.MaxPool3d(kernel_size=2,
stride=2, padding=0)
output = MaxPool2d(input)
# output.shape [1, 3, 16, 16]
output = MaxPool3d(input)
# output.shape [1, 2, 3, 16, 16]
# for return_indices=True
MaxPool2d = nn.MaxPool2d(kernel_size=2,stride=2, padding=0, return_indices=True)
output, max_indices = MaxPool2d(input)
# output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
MaxPool3d = nn.MaxPool3d(kernel_size=2,stride=2, padding=0, return_indices=True)
output, max_indices = MaxPool3d(input)
# output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16],
"""
def __init__(self,
kernel_size,
stride=None,
padding=0,
stride,
padding,
return_indices=False,
ceil_mode=False,
data_format="NCHW",
data_format="NCDHW",
name=None):
super(MaxPool2d, self).__init__()
super(MaxPool3d, self).__init__()
self.ksize = kernel_size
self.stride = stride
self.padding = padding
......@@ -693,7 +583,7 @@ class MaxPool2d(layers.Layer):
self.name = name
def forward(self, x):
return F.max_pool2d(
return F.max_pool3d(
x,
kernel_size=self.ksize,
stride=self.stride,
......@@ -703,175 +593,457 @@ class MaxPool2d(layers.Layer):
name=self.name)
class MaxPool3d(layers.Layer):
class AdaptiveAvgPool1d(layers.Layer):
"""
This operation applies 3D max pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCDHW format, where N is batch size, C is the number of channels,
H is the height of the feature, D is the depth of the feature, and W is the width of the feature.
This operation applies a 1D adaptive average pooling over an input signal composed
of several input planes, based on the input, output_size, return_indices parameters.
Input(X) and output(Out) are in NCL format, where N is batch
size, C is the number of channels, L is the length of the feature.
The output tensor shape will be [N, C, output_size].
For average adaptive pool1d:
.. math::
lstart &= floor(i * L_{in} / L_{out})
lend &= ceil((i + 1) * L_{in} / L_{out})
Output(i) &= \\frac{sum(Input[lstart:lend])}{(lend - lstart)}
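A rough numpy sketch of the adaptive rule above, with illustrative values rather than the Paddle implementation:
.. code-block:: python
import math
import numpy as np
x = np.arange(32, dtype=np.float32)   # one channel of one sample, L_in = 32
L_in, L_out = len(x), 16
out = np.array([x[math.floor(i * L_in / L_out):math.ceil((i + 1) * L_in / L_out)].mean()
                for i in range(L_out)])
print(out.shape)   # (16,)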
Args:
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
is a tuple or list, it must contain three integers,
(pool_size_Depth, pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
Otherwise, the pool stride size will be a cube of an int. Default kernel_size.
padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
it could be in three forms: `[pad_depth, pad_height, pad_width]` or
`[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
`[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
ceil_mode (bool): when True, will use ceil instead of floor to compute the output shape.
count_include_pad (bool): Whether to exclude padding points in average pooling
mode, default is True.
data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain one int.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
None.
Returns: None.
Raises:
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is "VALID", but `ceil_mode` is True.
ShapeError: If the output's shape calculated is not greater than 0.
ValueError: 'pool_size' should be an integer or list or tuple with length as 1.
Shape:
- x: 3-D tensor.
- out: 3-D tensor.
Examples:
.. code-block:: python
# average adaptive pool1d
# suppose input data in shape of [N, C, L], `output_size` is m or [m],
# output shape is [N, C, m], adaptive pool divide L dimension
# of input data into m grids averagely and performs poolings in each
# grid to get output.
# adaptive avg pool performs calculations as follow:
#
# for i in range(m):
# lstart = floor(i * L / m)
# lend = ceil((i + 1) * L / m)
# output[:, :, i] = sum(input[:, :, lstart: lend])/(lend - lstart)
#
import paddle
import paddle.nn as nn
import numpy as np
paddle.disable_static()
# max pool3d
input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32))
MaxPool3d = nn.MaxPool3d(kernel_size=2,
stride=2, padding=0)
output = MaxPool3d(input)
# output.shape [1, 2, 3, 16, 16]
# for return_indices=True
MaxPool3d = nn.MaxPool3d(kernel_size=2,stride=2, padding=0, return_indices=True)
output, max_indices = MaxPool3d(input)
# output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16],
data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
AdaptiveAvgPool1d = nn.AdaptiveAvgPool1d(output_size=16)
pool_out = AdaptiveAvgPool1d(data)
# pool_out shape: [1, 3, 16]
"""
def __init__(self,
kernel_size,
stride,
padding,
return_indices=False,
ceil_mode=False,
data_format="NCDHW",
name=None):
super(MaxPool3d, self).__init__()
self.ksize = kernel_size
self.stride = stride
self.padding = padding
self.return_indices = return_indices
self.ceil_mode = ceil_mode
self.data_format = data_format
def __init__(self, output_size, name=None):
super(AdaptiveAvgPool1d, self).__init__()
self.output_size = output_size
self.name = name
def forward(self, x):
return F.max_pool3d(
x,
kernel_size=self.ksize,
stride=self.stride,
padding=self.padding,
return_indices=self.return_indices,
data_format=self.data_format,
name=self.name)
def forward(self, input):
return F.adaptive_avg_pool1d(input, self.output_size, self.name)
class AvgPool3d(layers.Layer):
class AdaptiveAvgPool2d(layers.Layer):
"""
This operation applies 3D max pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCDHW format, where N is batch size, C is the number of channels,
H is the height of the feature, D is the depth of the feature, and W is the width of the feature.
Args:
kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
is a tuple or list, it must contain three integers,
(pool_size_Depth, pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
stride (int|list|tuple): The pool stride size. If the pool stride size is a tuple or list,
it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
Otherwise, the pool stride size will be a cube of an int.
padding (string|int|list|tuple): The pool padding. If `padding` is a string, it must be
either 'VALID' or 'SAME', which is the padding algorithm. If the pool padding size is a tuple or list,
it could be in three forms: `[pad_depth, pad_height, pad_width]` or
`[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
and when `data_format` is `"NCDHW"`, `padding` can be in the form
`[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
When `data_format` is `"NDHWC"`, `padding` can be in the form
`[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
ceil_mode (bool): ${ceil_mode_comment}
count_include_pad (bool): Whether to include padding points in the average calculation.
Default is True.
divisor_override (int|float, optional): If specified, it will be used as the divisor;
otherwise kernel_size will be used. Default is None.
data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_depth, input_height, input_width]`.
This operation applies 2D adaptive avg pooling on the input tensor. The h and w dimensions
of the output tensor are determined by the parameter output_size.
For avg adaptive pool2d:
.. math::
hstart &= floor(i * H_{in} / H_{out})
hend &= ceil((i + 1) * H_{in} / H_{out})
wstart &= floor(j * W_{in} / W_{out})
wend &= ceil((j + 1) * W_{in} / W_{out})
Output(i, j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
Parameters:
output_size (int|list|tuple): The target output size. If it is a tuple or list,
it must contain two elements, (H, W). H and W can be either an int, or None, which means
the size will be the same as that of the input.
data_format (str): The data format of the input and output data. An optional string
from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
the order of: [batch_size, input_channels, input_height, input_width].
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name does not need to be set and
is None by default.
Returns: None.
Raises:
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is "VALID", but `ceil_mode` is True.
ShapeError: If the output's shape calculated is not greater than 0.
Shape:
x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type can be float32, float64.
output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type is same as input x.
Returns:
A callable object of AdaptiveAvgPool2d.
Examples:
.. code-block:: python
# adaptive avg pool2d
# suppose input data is in shape of [N, C, H, W], `output_size` is [m, n],
# output shape is [N, C, m, n]; adaptive pooling evenly divides the H and W dimensions
# of the input into m * n grids and performs average pooling in each
# grid to get the output.
# adaptive avg pool performs calculations as follows:
#
# for i in range(m):
#     for j in range(n):
#         hstart = floor(i * H / m)
#         hend = ceil((i + 1) * H / m)
#         wstart = floor(j * W / n)
#         wend = ceil((j + 1) * W / n)
#         output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
#
import paddle
import paddle.nn as nn
import numpy as np
paddle.disable_static()
input_data = np.random.rand(2, 3, 32, 32)
x = paddle.to_tensor(input_data)
# x.shape is [2, 3, 32, 32]
adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=3)
pool_out = adaptive_avg_pool(x = x)
# pool_out.shape is [2, 3, 3, 3]
"""
def __init__(self, output_size, data_format="NCHW", name=None):
super(AdaptiveAvgPool2d, self).__init__()
self._output_size = output_size
self._data_format = data_format
self._name = name
def forward(self, x):
return F.adaptive_avg_pool2d(
x,
output_size=self._output_size,
data_format=self._data_format,
name=self._name)
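# A small NumPy sketch of the hstart/hend, wstart/wend formula from the
# AdaptiveAvgPool2d docstring above; `adaptive_avg_pool2d_ref` is a
# hypothetical reference helper, not a Paddle function.
import math
import numpy as np

def adaptive_avg_pool2d_ref(x, out_h, out_w):
    # x: ndarray of shape [N, C, H, W] (NCHW)
    N, C, H, W = x.shape
    out = np.empty((N, C, out_h, out_w), dtype=x.dtype)
    for i in range(out_h):
        hs, he = math.floor(i * H / out_h), math.ceil((i + 1) * H / out_h)
        for j in range(out_w):
            ws, we = math.floor(j * W / out_w), math.ceil((j + 1) * W / out_w)
            # average over the adaptive window for output cell (i, j)
            out[:, :, i, j] = x[:, :, hs:he, ws:we].mean(axis=(-2, -1))
    return out

# adaptive_avg_pool2d_ref(np.random.rand(2, 3, 32, 32), 3, 3).shape == (2, 3, 3, 3)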
# avg pool3d
input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32))
AvgPool3d = nn.AvgPool3d(kernel_size=2,
stride=2, padding=0)
output = AvgPool3d(input)
# output.shape [1, 2, 3, 16, 16]
class AdaptiveAvgPool3d(layers.Layer):
"""
def __init__(self,
kernel_size,
stride,
padding=0,
ceil_mode=False,
count_include_pad=True,
divisor_override=None,
data_format="NCDHW",
name=None):
super(AvgPool3d, self).__init__()
self.ksize = kernel_size
self.stride = stride
self.padding = padding
self.ceil_mode = ceil_mode
self.count_include_pad = count_include_pad
self.divisor = divisor_override
self.data_format = data_format
This operation applies 3D adaptive avg pooling on the input tensor. The d, h and w dimensions
of the output tensor are determined by the parameter output_size.
For avg adaptive pool3d:
.. math::
dstart &= floor(i * D_{in} / D_{out})
dend &= ceil((i + 1) * D_{in} / D_{out})
hstart &= floor(j * H_{in} / H_{out})
hend &= ceil((j + 1) * H_{in} / H_{out})
wstart &= floor(k * W_{in} / W_{out})
wend &= ceil((k + 1) * W_{in} / W_{out})
Output(i, j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
Parameters:
output_size (int|list|tuple): The target output size. If it is a tuple or list,
it must contain three elements, (D, H, W). D, H and W can be either an int, or None, which means
the size will be the same as that of the input.
data_format (str): The data format of the input and output data. An optional string
from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in
the order of: [batch_size, input_channels, input_depth, input_height, input_width].
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name does not need to be set and
is None by default.
Shape:
x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type can be float32, float64.
output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x.
Returns:
A callable object of AdaptiveAvgPool3d.
Examples:
.. code-block:: python
# adaptive avg pool3d
# suppose input data is in shape of [N, C, D, H, W], `output_size` is [l, m, n],
# output shape is [N, C, l, m, n]; adaptive pooling evenly divides the D, H and W dimensions
# of the input into l * m * n grids and performs average pooling in each
# grid to get the output.
# adaptive avg pool performs calculations as follows:
#
# for i in range(l):
# for j in range(m):
# for k in range(n):
# dstart = floor(i * D / l)
# dend = ceil((i + 1) * D / l)
# hstart = floor(j * H / m)
# hend = ceil((j + 1) * H / m)
# wstart = floor(k * W / n)
# wend = ceil((k + 1) * W / n)
# output[:, :, i, j, k] =
# avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
import paddle
import numpy as np
paddle.disable_static()
input_data = np.random.rand(2, 3, 8, 32, 32)
x = paddle.to_tensor(input_data)
# x.shape is [2, 3, 8, 32, 32]
adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=3)
pool_out = adaptive_avg_pool(x = x)
# pool_out.shape is [2, 3, 3, 3, 3]
"""
def __init__(self, output_size, data_format="NCDHW", name=None):
super(AdaptiveAvgPool3d, self).__init__()
self._output_size = output_size
self._data_format = data_format
self._name = name
def forward(self, x):
return F.adaptive_avg_pool3d(
x,
output_size=self._output_size,
data_format=self._data_format,
name=self._name)
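# A usage sketch based on the output_size description above: entries set to
# None keep the corresponding input dimension unchanged. It assumes the same
# dygraph-style API as the docstring examples in this file.
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.random.rand(2, 3, 8, 32, 32).astype("float32"))
pool = paddle.nn.AdaptiveAvgPool3d(output_size=(None, 4, 4))
y = pool(x)
# y.shape is [2, 3, 8, 4, 4]: depth kept at 8, H and W pooled down to 4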
class AdaptiveMaxPool1d(layers.Layer):
"""
This operation applies a 1D adaptive max pooling over an input signal composed
of several input planes, based on the input, output_size, return_indices parameters.
Input(X) and output(Out) are in NCL format, where N is batch
size, C is the number of channels, L is the length of the feature.
The output tensor shape will be [N, C, output_size].
For max adaptive pool1d:
.. math::
lstart &= floor(i * L_{in} / L_{out})
lend &= ceil((i + 1) * L_{in} / L_{out})
Output(i) &= max(Input[lstart:lend])
Args:
output_size (int|list|tuple): The target output size. If it is a tuple or list,
it must contain one int.
return_indices (bool): If true, the index of max pooling point will be returned along
with outputs. It cannot be set in average pooling type. Default False.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name does not need to be set and
is None by default.
Returns:
None.
Raises:
ValueError: If 'pool_size' is not an integer or a list/tuple of length 1.
Shape:
x (Tensor): The input tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type can be float32, float64.
output (Tensor): The output tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type is same as input x.
Examples:
.. code-block:: python
# max adaptive pool1d
# suppose input data is in shape of [N, C, L], `output_size` is m or [m],
# output shape is [N, C, m]; adaptive pooling evenly divides the L dimension
# of the input into m grids and performs max pooling in each
# grid to get the output.
# adaptive max pool performs calculations as follows:
#
# for i in range(m):
# lstart = floor(i * L / m)
# lend = ceil((i + 1) * L / m)
# output[:, :, i] = max(input[:, :, lstart: lend])
#
import paddle
import paddle.nn as nn
import numpy as np
paddle.disable_static()
data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16)
pool_out = AdaptiveMaxPool1d(data)
# pool_out shape: [1, 3, 16]
# for return_indices = True
AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16, return_indices=True)
pool_out, indices = AdaptiveMaxPool1d(data)
# pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
"""
def __init__(self, output_size, return_indices=False, name=None):
super(AdaptiveMaxPool1d, self).__init__()
self.output_size = output_size
self.return_indices = return_indices
self.name = name
def forward(self, input):
return F.adaptive_max_pool1d(input, self.output_size,
self.return_indices, self.name)
class AdaptiveMaxPool2d(layers.Layer):
"""
This operation applies 2D adaptive max pooling on the input tensor. The h and w dimensions
of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and regular pooling is that adaptive pooling fixes the output size rather than the kernel size.
For adaptive max pool2d:
.. math::
hstart &= floor(i * H_{in} / H_{out})
hend &= ceil((i + 1) * H_{in} / H_{out})
wstart &= floor(j * W_{in} / W_{out})
wend &= ceil((j + 1) * W_{in} / W_{out})
Output(i, j) &= max(Input[hstart:hend, wstart:wend])
Parameters:
output_size (int|list|tuple): The target output size. If it is a tuple or list, it must contain two elements, (H, W). H and W can be either an int, or None, which means the size will be the same as that of the input.
return_indices (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name does not need to be set and
is None by default.
Shape:
x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float32, float64.
output (Tensor): The output tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type is same as input x.
Returns:
A callable object of AdaptiveMaxPool2d.
Examples:
.. code-block:: python
# adaptive max pool2d
# suppose input data is in shape of [N, C, H, W], `output_size` is [m, n],
# output shape is [N, C, m, n]; adaptive pooling evenly divides the H and W dimensions
# of the input into m * n grids and performs max pooling in each
# grid to get the output.
# adaptive max pool performs calculations as follows:
#
# for i in range(m):
#     for j in range(n):
#         hstart = floor(i * H / m)
#         hend = ceil((i + 1) * H / m)
#         wstart = floor(j * W / n)
#         wend = ceil((j + 1) * W / n)
#         output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend])
#
import paddle
import numpy as np
paddle.disable_static()
input_data = np.random.rand(2, 3, 32, 32)
x = paddle.to_tensor(input_data)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=3, return_indices=True)
pool_out, indices = adaptive_max_pool(x = x)
"""
def __init__(self, output_size, return_indices=False, name=None):
super(AdaptiveMaxPool2d, self).__init__()
self._output_size = output_size
self._return_indices = return_indices
self._name = name
def forward(self, x):
    return F.avg_pool3d(
        x,
        kernel_size=self.ksize,
        stride=self.stride,
        padding=self.padding,
        ceil_mode=self.ceil_mode,
        count_include_pad=self.count_include_pad,
        divisor_override=self.divisor,
        data_format=self.data_format,
        name=self.name)
def forward(self, x):
    return F.adaptive_max_pool2d(
        x,
        output_size=self._output_size,
        return_indices=self._return_indices,
        name=self._name)
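# A NumPy sketch of the adaptive max-pool grid math above that also returns
# an argmax position (flattened row-major inside the H*W plane); an
# illustrative reference only, not the Paddle kernel and not guaranteed to
# match its index convention.
import math
import numpy as np

def adaptive_max_pool2d_ref(x, out_h, out_w):
    # x: ndarray of shape [N, C, H, W]
    N, C, H, W = x.shape
    out = np.empty((N, C, out_h, out_w), dtype=x.dtype)
    idx = np.empty((N, C, out_h, out_w), dtype=np.int64)
    for i in range(out_h):
        hs, he = math.floor(i * H / out_h), math.ceil((i + 1) * H / out_h)
        for j in range(out_w):
            ws, we = math.floor(j * W / out_w), math.ceil((j + 1) * W / out_w)
            win = x[:, :, hs:he, ws:we].reshape(N, C, -1)
            local = win.argmax(axis=-1)  # position inside the current window
            out[:, :, i, j] = win.max(axis=-1)
            # map the window-local position back into the full H*W plane
            idx[:, :, i, j] = (hs + local // (we - ws)) * W + (ws + local % (we - ws))
    return out, idx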
class AdaptiveMaxPool3d(layers.Layer):
"""
This operation applies 3D adaptive max pooling on the input tensor. The d, h and w dimensions
of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and regular pooling is that adaptive pooling fixes the output size rather than the kernel size.
For adaptive max pool3d:
.. math::
dstart &= floor(i * D_{in} / D_{out})
dend &= ceil((i + 1) * D_{in} / D_{out})
hstart &= floor(j * H_{in} / H_{out})
hend &= ceil((j + 1) * H_{in} / H_{out})
wstart &= floor(k * W_{in} / W_{out})
wend &= ceil((k + 1) * W_{in} / W_{out})
Output(i, j, k) &= max(Input[dstart:dend, hstart:hend, wstart:wend])
Parameters:
output_size (int|list|tuple): The target output size. If it is a tuple or list,
it must contain three elements, (D, H, W). D, H and W can be either an int, or None, which means
the size will be the same as that of the input.
return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name does not need to be set and
is None by default.
Shape:
x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64.
output (Tensor): The output tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type is same as input x.
Returns:
A callable object of AdaptiveMaxPool3d.
Examples:
.. code-block:: python
# adaptive max pool3d
# suppose input data is in shape of [N, C, D, H, W], `output_size` is [l, m, n],
# output shape is [N, C, l, m, n]; adaptive pooling evenly divides the D, H and W dimensions
# of the input into l * m * n grids and performs max pooling in each
# grid to get the output.
# adaptive max pool performs calculations as follows:
#
# for i in range(l):
# for j in range(m):
# for k in range(n):
# dstart = floor(i * D / l)
# dend = ceil((i + 1) * D / l)
# hstart = floor(j * H / m)
# hend = ceil((j + 1) * H / m)
# wstart = floor(k * W / n)
# wend = ceil((k + 1) * W / n)
# output[:, :, i, j, k] =
# max(input[:, :, dstart:dend, hstart: hend, wstart: wend])
import paddle
import numpy as np
paddle.disable_static()
input_data = np.random.rand(2, 3, 8, 32, 32)
x = paddle.to_tensor(input_data)
pool = paddle.nn.AdaptiveMaxPool3d(output_size=4)
out = pool(x)
# out shape: [2, 3, 4, 4, 4]
pool = paddle.nn.AdaptiveMaxPool3d(output_size=3, return_indices=True)
out, indices = pool(x)
# out shape: [2, 3, 3, 3, 3], indices shape: [2, 3, 3, 3, 3]
"""
def __init__(self, output_size, return_indices=False, name=None):
super(AdaptiveMaxPool3d, self).__init__()
self._output_size = output_size
self._return_indices = return_indices
self._name = name
def forward(self, x):
return F.adaptive_max_pool3d(
x,
output_size=self._output_size,
return_indices=self._return_indices,
name=self._name)
......@@ -26,9 +26,8 @@ __all__ = [
]
from ..fluid.optimizer import SGD, Momentum, Adagrad, Dpsgd, DecayedAdagrad, \
Ftrl, Adadelta, \
SGDOptimizer, MomentumOptimizer, AdagradOptimizer,DpsgdOptimizer,\
from ..fluid.optimizer import Momentum, Adagrad, Dpsgd, DecayedAdagrad, Ftrl,\
AdagradOptimizer,DpsgdOptimizer,\
DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \
ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\
ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \
......@@ -39,6 +38,9 @@ from .adam import Adam
from .adamw import AdamW
from .adamax import Adamax
from .rmsprop import RMSProp
from .adadelta import Adadelta
from .sgd import SGD
from .momentum import Momentum
from . import lr_scheduler
from .lr_scheduler import _LRScheduler, NoamLR, PiecewiseLR, NaturalExpLR, InverseTimeLR, PolynomialLR, \
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable, name_scope
__all__ = ["Adadelta"]
class Adadelta(Optimizer):
"""
**Notes: This API does not support sparse parameter optimization.**
Adadelta Optimizer. Please refer to this for details:
`ADADELTA: AN ADAPTIVE LEARNING RATE METHOD <https://arxiv.org/abs/1212.5701>`_.
The update is done as follows:
.. math::
E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2
learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) }
E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2
Args:
learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
epsilon (float): a small float number for numeric stability. Default 1.0e-6.
rho (float): a floating point value indicating the decay rate. Default 0.95.
parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
It can be a float value as the coefficient of L2 regularization or \
:ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
the regularization setting here in optimizer will be ignored for this parameter. \
Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase``. There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): The default value is None. Normally there is no need for user
to set this property. For more information, please refer to
:ref:`api_guide_Name` .
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.mean(out)
adadelta = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
out.backward()
adadelta.step()
adadelta.clear_grad()
"""
_avg_squared_grad_acc_str = "_avg_squared_grad"
_avg_squared_update_acc_str = "_avg_squared_update"
def __init__(self,
learning_rate=0.001,
epsilon=1.0e-6,
rho=0.95,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None):
if learning_rate is None:
raise ValueError("learning_rate is not set.")
if epsilon is None:
raise ValueError("epsilon is not set.")
if rho is None:
raise ValueError("rho is not set.")
super(Adadelta, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
weight_decay=weight_decay,
grad_clip=grad_clip,
name=name)
self.type = "adadelta"
self._epsilon = epsilon
self._rho = rho
def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
for p in parameters:
self._add_accumulator(self._avg_squared_grad_acc_str, p)
self._add_accumulator(self._avg_squared_update_acc_str, p)
def _append_optimize_op(self, block, param_and_grad):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
avg_squared_grad_acc = self._get_accumulator(
self._avg_squared_grad_acc_str, param_and_grad[0])
avg_squared_update_acc = self._get_accumulator(
self._avg_squared_update_acc_str, param_and_grad[0])
# Create the adadelta optimizer op
adadelta_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"AvgSquaredGrad": avg_squared_grad_acc,
"AvgSquaredUpdate": avg_squared_update_acc
},
outputs={
"ParamOut": param_and_grad[0],
"AvgSquaredGradOut": avg_squared_grad_acc,
"AvgSquaredUpdateOut": avg_squared_update_acc
},
attrs={"epsilon": self._epsilon,
"rho": self._rho},
stop_gradient=True)
return adadelta_op
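# A NumPy sketch of one Adadelta step following the update equations in the
# docstring above (rho and epsilon as defined there); it mirrors the formulas,
# not the exact C++ adadelta kernel.
import numpy as np

def adadelta_step(param, grad, avg_sq_grad, avg_sq_update, rho=0.95, epsilon=1.0e-6):
    avg_sq_grad = rho * avg_sq_grad + (1 - rho) * grad ** 2
    lr = np.sqrt((avg_sq_update + epsilon) / (avg_sq_grad + epsilon))
    update = -lr * grad
    avg_sq_update = rho * avg_sq_update + (1 - rho) * update ** 2
    return param + update, avg_sq_grad, avg_sq_update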
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable, name_scope
__all__ = ["Momentum"]
class Momentum(Optimizer):
"""
Simple Momentum optimizer with velocity state
This optimizer has a flag for Nesterov momentum.
The update equations are as follows:
.. math::
& velocity = mu * velocity + gradient
& if (use\_nesterov):
&\quad param = param - (gradient + mu * velocity) * learning\_rate
& else:
&\quad param = param - learning\_rate * velocity
Parameters:
learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
momentum (float): Momentum factor. The default value is 0.9.
parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
It can be a float value as the coefficient of L2 regularization or \
:ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
the regularization setting here in optimizer will be ignored for this parameter. \
Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase``. There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): The default value is None. Normally there is no need for user
to set this property. For more information, please refer to
:ref:`api_guide_Name` .
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.mean(out)
momentum = paddle.optimizer.Momentum(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
out.backward()
momentum.step()
momentum.clear_grad()
"""
_velocity_acc_str = "velocity"
def __init__(self,
learning_rate=0.001,
momentum=0.9,
parameters=None,
use_nesterov=False,
weight_decay=None,
grad_clip=None,
name=None):
if learning_rate is None:
raise ValueError("learning_rate is not set")
if momentum is None:
raise ValueError("momentum is not set")
super(Momentum, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
weight_decay=weight_decay,
grad_clip=grad_clip,
name=name)
self.type = "momentum"
self._momentum = momentum
self._use_nesterov = bool(use_nesterov)
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
for p in parameters:
self._add_accumulator(self._velocity_acc_str, p)
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
velocity_acc = self._get_accumulator(self._velocity_acc_str,
param_and_grad[0])
lr = self._create_param_lr(param_and_grad)
if framework.in_dygraph_mode():
_, _ = core.ops.momentum(param_and_grad[0], param_and_grad[1],
velocity_acc, lr, param_and_grad[0],
velocity_acc, 'mu', self._momentum,
'use_nesterov', self._use_nesterov)
return None
attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov}
inputs = {
"Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]],
"Velocity": [velocity_acc],
"LearningRate": [lr]
}
outputs = {
"ParamOut": [param_and_grad[0]],
"VelocityOut": [velocity_acc]
}
# create the momentum optimize op
momentum_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True)
return momentum_op
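# A NumPy-friendly sketch of the momentum update equations in the docstring
# above, including the Nesterov variant; a reference for the formulas, not the
# fused core.ops.momentum kernel.
def momentum_step(param, grad, velocity, lr, mu=0.9, use_nesterov=False):
    velocity = mu * velocity + grad
    if use_nesterov:
        param = param - (grad + mu * velocity) * lr
    else:
        param = param - lr * velocity
    return param, velocity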
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable, name_scope
from ..fluid.dygraph import no_grad
__all__ = ["SGD"]
class SGD(Optimizer):
"""
Optimizer of the stochastic gradient descent algorithm.
.. math::
param\_out = param - learning\_rate * grad
Parameters:
learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
It can be a float value as the coefficient of L2 regularization or \
:ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
the regularization setting here in optimizer will be ignored for this parameter. \
Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase``. There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): The default value is None. Normally there is no need for user
to set this property. For more information, please refer to
:ref:`api_guide_Name` .
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.mean(out)
sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
out.backward()
sgd.step()
sgd.clear_grad()
"""
def __init__(self,
learning_rate=0.001,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None):
if learning_rate is None:
raise ValueError("learning_rate is not set")
super(SGD, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
weight_decay=weight_decay,
grad_clip=grad_clip,
name=name)
self.type = "sgd"
@no_grad()
def _append_optimize_op(self, block, param_and_grad):
lr = self._create_param_lr(param_and_grad)
if framework.in_dygraph_mode():
core.ops.sgd(param_and_grad[0], lr, param_and_grad[1],
param_and_grad[0])
return None
assert isinstance(block, framework.Block)
# create the optimize op
sgd_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"LearningRate": lr
},
outputs={"ParamOut": param_and_grad[0]},
stop_gradient=True)
return sgd_op
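# The SGD rule documented above, written out as a one-line reference:
# param_out = param - learning_rate * grad.
def sgd_step(param, grad, learning_rate=0.001):
    return param - learning_rate * grad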
......@@ -1618,6 +1618,10 @@ def clip(x, min=None, max=None, name=None):
fmax = float(np.finfo(np_dtype).max)
if in_dygraph_mode():
if isinstance(min, Variable):
min = min.numpy().item(0)
if isinstance(max, Variable):
max = max.numpy().item(0)
min = fmin if min is None else min
max = fmax if max is None else max
return core.ops.clip(x, "min", min, "max", max)
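# A NumPy sketch of the scalar handling above: tensor-valued min/max are
# reduced to Python scalars, and a missing bound falls back to the dtype's
# finfo limits before clipping.
import numpy as np

def clip_ref(x, min=None, max=None):
    info = np.finfo(x.dtype)
    lo = info.min if min is None else float(np.asarray(min).item())
    hi = info.max if max is None else float(np.asarray(max).item())
    return np.clip(x, lo, hi)

# clip_ref(np.array([-3.0, 0.5, 7.0], dtype=np.float32), min=-1.0, max=1.0)
# -> [-1.0, 0.5, 1.0]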
......
......@@ -94,7 +94,7 @@ def bernoulli(x, name=None):
return out
def gaussian_random(shape, mean=0.0, std=1.0, dtype='float32', name=None):
def gaussian_random(shape, mean=0.0, std=1.0, dtype=None, name=None):
"""
This OP returns a Tensor filled with random values sampled from a Gaussian
distribution, with ``shape`` and ``dtype``.
......@@ -109,9 +109,10 @@ def gaussian_random(shape, mean=0.0, std=1.0, dtype='float32', name=None):
std(float|int, optional): Standard deviation of the output tensor, default
is 1.0.
seed(int, optional): ${seed_comment}
dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of
the output Tensor. Supported data types: float32, float64.
Default is float32.
dtype(str|np.dtype, optional): The data type of the output Tensor.
Supported data types: float32, float64.
Default is None, use global default dtype (see ``get_default_dtype``
for details).
name(str, optional): The default value is None. Normally there is no
need for user to set this property. For more information, please
refer to :ref:`api_guide_Name`.
......@@ -120,6 +121,13 @@ def gaussian_random(shape, mean=0.0, std=1.0, dtype='float32', name=None):
Tensor: A Tensor filled with random values sampled from a Gaussian
distribution, with ``shape`` and ``dtype``.
"""
if dtype is None:
dtype = paddle.framework.get_default_dtype()
if dtype not in ['float32', 'float64']:
raise TypeError(
"gaussian_random only supports [float32, float64], but the default dtype is %s"
% dtype)
if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype)
seed = 0
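# A hedged sketch of the default-dtype resolution pattern used above by
# gaussian_random (and by standard_normal / uniform / rand below): fall back
# to the global default dtype and reject anything other than float32/float64.
import paddle

def resolve_float_dtype(dtype=None):
    if dtype is None:
        dtype = paddle.framework.get_default_dtype()
    if dtype not in ['float32', 'float64']:
        raise TypeError(
            "only float32 and float64 are supported, but got %s" % dtype)
    return dtype

# resolve_float_dtype() -> 'float32' unless the global default dtype was changed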
......@@ -169,9 +177,10 @@ def standard_normal(shape, dtype=None, name=None):
(with the shape [1], and the data type int32 or int64). If ``shape``
is a Tensor, it should be a 1-D Tensor(with the data type int32 or
int64).
dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the
output tensor. Supported data types: float32, float64. If ``dtype``
is None, the data type is float32. Default is None.
dtype(str|np.dtype, optional): The data type of the output Tensor.
Supported data types: float32, float64.
Default is None, use global default dtype (see ``get_default_dtype``
for details).
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
......@@ -216,7 +225,11 @@ def standard_normal(shape, dtype=None, name=None):
"""
if dtype is None:
dtype = 'float32'
dtype = paddle.framework.get_default_dtype()
if dtype not in ['float32', 'float64']:
raise TypeError(
"standard_normal only supports [float32, float64], but the default dtype is %s"
% dtype)
return gaussian_random(
shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name)
......@@ -325,7 +338,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None):
return out
def uniform(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None):
def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None):
"""
This OP returns a Tensor filled with random values sampled from a uniform
distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``.
......@@ -343,9 +356,10 @@ def uniform(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None):
(with the shape [1], and the data type int32 or int64). If ``shape``
is a Tensor, it should be a 1-D Tensor(with the data type int32 or
int64).
dtype(str|np.dtype, optional): The data type of
the output Tensor. Supported data types: float32, float64.
Default is float32.
dtype(str|np.dtype, optional): The data type of the output Tensor.
Supported data types: float32, float64.
Default is None, use global default dtype (see ``get_default_dtype``
for details).
min(float|int, optional): The lower bound on the range of random values
to generate, ``min`` is included in the range. Default is -1.0.
max(float|int, optional): The upper bound on the range of random values
......@@ -401,6 +415,13 @@ def uniform(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None):
"""
if dtype is None:
dtype = paddle.framework.get_default_dtype()
if dtype not in ['float32', 'float64']:
raise TypeError(
"uniform only supports [float32, float64], but the default dtype is %s"
% dtype)
if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype)
......@@ -447,7 +468,7 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None):
(with the shape [1], and the data type int32 or int64). If ``shape``
is a Tensor, it should be a 1-D Tensor(with the data type int32 or
int64). Default is [1].
dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the
dtype(str|np.dtype, optional): The data type of the
output tensor. Supported data types: int32, int64. If ``dtype``
is None, the data type is int64. Default is None.
name(str, optional): The default value is None. Normally there is no
......@@ -550,7 +571,7 @@ def randperm(n, dtype="int64", name=None):
Args:
n(int): The upper bound (exclusive), and it should be greater than 0.
dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of
dtype(str|np.dtype, optional): The data type of
the output Tensor. Supported data types: int32, int64, float32,
float64. Default is int64.
name(str, optional): The default value is None. Normally there is no
......@@ -622,9 +643,10 @@ def rand(shape, dtype=None, name=None):
(with the shape [1], and the data type int32 or int64). If ``shape``
is a Tensor, it should be a 1-D Tensor(with the data type int32 or
int64).
dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the
output tensor. Supported data types: float32, float64. If ``dtype``
is None, the data type is float32. Default is None.
dtype(str|np.dtype, optional): The data type of the output Tensor.
Supported data types: float32, float64.
Default is None, use global default dtype (see ``get_default_dtype``
for details).
name(str, optional): The default value is None. Normally there is no
need for user to set this property. For more information, please
refer to :ref:`api_guide_Name`.
......@@ -668,7 +690,11 @@ def rand(shape, dtype=None, name=None):
"""
if dtype is None:
dtype = 'float32'
dtype = paddle.framework.get_default_dtype()
if dtype not in ['float32', 'float64']:
raise TypeError(
"rand only supports [float32, float64], but the default dtype is %s"
% dtype)
out = uniform(shape, dtype, min=0.0, max=1.0, name=name)
out.stop_gradient = True
......
......@@ -55,7 +55,7 @@ def get_os_info():
else:
plat = None
ver = None
envs['os_info'] = "{} {}".format(plat, ver)
envs['os_info'] = "{0} {1}".format(plat, ver)
def get_python_info():
......@@ -93,7 +93,7 @@ def get_cudnn_info():
if cudnn_dll_path:
cudnn_header_path = cudnn_dll_path.split('bin')[
0] + 'include\cudnn.h'
cmd = 'type "{}" | findstr "{}" | findstr /v "CUDNN_VERSION"'
cmd = 'type "{0}" | findstr "{1}" | findstr /v "CUDNN_VERSION"'
else:
envs['cudnn_version'] = None
return
......@@ -102,7 +102,7 @@ def get_cudnn_info():
'whereis "cudnn.h" | awk \'{print $2}\'')
if cudnn_header_path:
cudnn_header_path = cudnn_header_path.strip()
cmd = 'cat "{}" | grep "{}" | grep -v "CUDNN_VERSION"'
cmd = 'cat "{0}" | grep "{1}" | grep -v "CUDNN_VERSION"'
else:
envs['cudnn_version'] = None
return
......@@ -112,7 +112,7 @@ def get_cudnn_info():
patch_level = _get_cudnn_ver(
cmd.format(cudnn_header_path, 'CUDNN_PATCHLEVEL'))
envs['cudnn_version'] = "{}.{}.{}".format(major, minor, patch_level)
envs['cudnn_version'] = "{0}.{1}.{2}".format(major, minor, patch_level)
def get_driver_info():
......@@ -132,7 +132,7 @@ def main():
get_cuda_info()
get_cudnn_info()
get_driver_info()
print(envs_template.format(**envs))
print('*' * 40 + envs_template.format(**envs) + '*' * 40)
if __name__ == '__main__':
......