diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index d00e38784c2c0415a59a33fc24d708c253481c21..9bde9e20b19a0b14ce4489b91d9ab3d5273f7f9a 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -12,67 +12,122 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/generator.h" + +#include + #include #include #include #include #include -#include "paddle/fluid/framework/generator.h" - namespace paddle { namespace framework { -std::shared_ptr Generator::gen_instance_ = NULL; +const std::shared_ptr& DefaultCPUGenerator() { + static auto default_cpu_generator = + std::make_shared(GetRandomSeed()); + VLOG(4) << "initial seed: " << default_cpu_generator->GetCurrentSeed() + << ", cpu engine: " << default_cpu_generator->GetCPUEngine().get(); + return default_cpu_generator; +} + +std::shared_ptr OpDefaultCPUEngine() { + static auto op_default_cpu_engine = std::make_shared(); + return op_default_cpu_engine; +} + +// NOTE(zhiqiu): there are 3 conditions: +// (1) op seed is not set and DefaultCPUGenerator is inited, use +// DefaultCPUGenerator +// (2) op seed is not set and DefaultCPUGenerator is not inited, use +// OpDefaultCPUEngine() and set a random seed +// (3) op seed is set, use OpDefaultCPUEngine() and set the seed +std::shared_ptr GetCPURandomEngine(uint64_t seed) { + if (DefaultCPUGenerator()->GetIsInitPy() && seed == 0) { + VLOG(4) << "Use random engine from generator"; + return DefaultCPUGenerator()->GetCPUEngine(); + } else { + // NOTE(zhiqiu): creating an engine instance every time instead of using + // OpDefaultCPUEngine(); this is the legacy behavior of random operators. + // The benefit is that when running PE with a fixed seed in multiple threads, + // each thread has its own engine and does not affect the others. + // + // And we need to measure the determinism of Generator in PE. 
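For orientation, here is a minimal sketch of how a CPU operator kernel is expected to draw random numbers through this helper; it mirrors the kernels updated later in this patch. The names fill_uniform, data, size, and seed_attr are illustrative only, with seed_attr standing for the operator's "seed" attribute (0 meaning "not set").

#include "paddle/fluid/framework/generator.h"

#include <cstdint>
#include <random>

// Fill a buffer with uniform samples. A zero seed defers to the global
// generator when manual_seed() was called from Python; otherwise a freshly
// (or fixed-) seeded engine is returned by GetCPURandomEngine.
void fill_uniform(float* data, int64_t size, uint64_t seed_attr) {
  auto engine = paddle::framework::GetCPURandomEngine(seed_attr);
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  for (int64_t i = 0; i < size; ++i) {
    data[i] = dist(*engine);
  }
}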
+ auto engine = std::make_shared(); + if (seed == 0) { + seed = GetRandomSeed(); + VLOG(4) << "Use default random engine with random seed = " << seed; + } else { + VLOG(4) << "Use default random engine with fixed random seed = " << seed; + } + static std::mutex mu_; + { + std::lock_guard lock(mu_); + engine->seed(seed); + } + return engine; + } +} -GeneratorState* Generator::GetState() { - std::lock_guard lock(this->mutex); - return this->state_.get(); +GeneratorState Generator::GetState() { + std::lock_guard lock(this->mu_); + state_.cpu_engine = *engine_; + return this->state_; } -void Generator::SetState(GeneratorState* state_in) { - std::lock_guard lock(this->mutex); - *this->state_ = *state_in; +void Generator::SetState(const GeneratorState& state) { + std::lock_guard lock(this->mu_); + this->state_ = state; + this->engine_ = std::make_shared(state.cpu_engine); } uint64_t Generator::GetCurrentSeed() { - std::lock_guard lock(this->mutex); - return this->state_->current_seed; + std::lock_guard lock(this->mu_); + return this->state_.current_seed; } uint64_t Generator::Seed() { - std::lock_guard lock(this->mutex); + std::lock_guard lock(this->mu_); uint64_t seed; std::random_device de; seed = ((((uint64_t)de()) << 32) + de()) & 0x1FFFFFFFFFFFFF; - this->state_->current_seed = seed; + this->state_.current_seed = seed; std::seed_seq seq({seed}); - this->state_->cpu_engine.seed(seq); + this->engine_->seed(seq); - return this->state_->current_seed; + return this->state_.current_seed; } void Generator::SetCurrentSeed(uint64_t seed) { - std::lock_guard lock(this->mutex); - this->state_->current_seed = uint64_t(seed); + std::lock_guard lock(this->mu_); + this->state_.current_seed = seed; std::seed_seq seq({seed}); - this->state_->cpu_engine.seed(seq); + this->engine_->seed(seq); } -std::mt19937_64& Generator::GetCPUEngine() { - std::lock_guard lock(this->mutex); - return this->state_->cpu_engine; +std::shared_ptr Generator::GetCPUEngine() { + std::lock_guard lock(this->mu_); + return this->engine_; } -void Generator::SetCPUEngine(std::mt19937_64 engine) { - std::lock_guard lock(this->mutex); - this->state_->cpu_engine = std::mt19937_64(engine); +void Generator::SetCPUEngine(std::shared_ptr engine) { + std::lock_guard lock(this->mu_); + this->engine_ = engine; } uint64_t Generator::Random64() { - std::lock_guard lock(this->mutex); - return this->state_->cpu_engine(); + std::lock_guard lock(this->mu_); + auto engine = this->engine_; + return (*engine)(); +} + +void Generator::SetIsInitPy(bool is_init_py) { + this->is_init_py_ = is_init_py; + VLOG(4) << "SetIsInitPy:" << this->is_init_py_; } +bool Generator::GetIsInitPy() const { return this->is_init_py_; } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h index 17870782ba72a3247de734642962ffec48c0c91e..82b35f7ad550e770e8d10457ddf6cdf8e6fbd709 100644 --- a/paddle/fluid/framework/generator.h +++ b/paddle/fluid/framework/generator.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include + #include #include #include // temp for debug @@ -27,6 +29,12 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +static uint64_t GetRandomSeed() { + std::random_device rd; + // double has 53 bit significant, so limit uint64 to 53 bits + return ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF; +} + struct GeneratorState { int64_t device = -1; uint64_t current_seed = 34342423252; @@ -35,62 +43,67 @@ struct GeneratorState { struct Generator { Generator() { - GeneratorState default_gen_state_cpu; - default_gen_state_cpu.device = -1; - default_gen_state_cpu.current_seed = 34342423252; - std::seed_seq seq({34342423252}); - default_gen_state_cpu.cpu_engine = std::mt19937_64(seq); - this->state_ = std::make_shared(default_gen_state_cpu); + auto seed = GetRandomSeed(); + std::seed_seq seq({seed}); + auto engine = std::make_shared(seq); + this->state_.cpu_engine = *engine; + this->state_.device = -1; + this->state_.current_seed = seed; + this->engine_ = engine; + VLOG(4) << "initial seed: " << this->state_.current_seed + << ", cpu engine: " << &this->state_.cpu_engine; + } + explicit Generator(uint64_t seed) { + std::seed_seq seq({seed}); + auto engine = std::make_shared(seq); + this->state_.cpu_engine = *engine; + this->state_.device = -1; + this->state_.current_seed = seed; + this->engine_ = engine; + VLOG(4) << "initial seed: " << this->state_.current_seed + << ", cpu engine: " << &this->state_.cpu_engine; + this->is_init_py_ = true; // TODO(zhiqiu): remove it in future } - explicit Generator(GeneratorState state_in) - : state_{std::make_shared(state_in)} {} - Generator(const Generator& other) - : Generator(other, std::lock_guard(other.mutex)) {} + Generator(const Generator& other) = delete; // get random state - GeneratorState* GetState(); + GeneratorState GetState(); // set random state - void SetState(GeneratorState* state_in); + void SetState(const GeneratorState&); // get current seed uint64_t GetCurrentSeed(); // random a seed and get uint64_t Seed(); - // set seed void SetCurrentSeed(uint64_t seed); // get cpu engine - std::mt19937_64& GetCPUEngine(); + std::shared_ptr GetCPUEngine(); // set cpu engine - void SetCPUEngine(std::mt19937_64 engine); + void SetCPUEngine(std::shared_ptr); uint64_t Random64(); - bool is_init_py = false; + void SetIsInitPy(bool); + bool GetIsInitPy() const; - // CPU Generator singleton - static std::shared_ptr GetInstance() { - if (NULL == gen_instance_) { - gen_instance_.reset(new paddle::framework::Generator()); - } - return gen_instance_; - } + private: + GeneratorState state_; + std::shared_ptr engine_; + mutable std::mutex mu_; + + // NOTE(zhiqiu): is_init_py_ is used to make generator be compatible with + // old seed, and it should be removed after all random-related operators + // and unittests upgrades to use generator. + bool is_init_py_ = false; +}; - static std::shared_ptr GetInstanceX() { - if (NULL == gen_instance_) { - gen_instance_.reset(new paddle::framework::Generator()); - } - gen_instance_->is_init_py = true; - return gen_instance_; - } +// The DefaultCPUGenerator is used in manual_seed() +const std::shared_ptr& DefaultCPUGenerator(); - private: - static std::shared_ptr gen_instance_; - std::shared_ptr state_; - mutable std::mutex mutex; +// If op seed is set or global is not set, the OpDefaultCPUEngine is used. 
+std::shared_ptr OpDefaultCPUEngine(); - Generator(const Generator& other, const std::lock_guard&) - : state_(std::make_shared(*(other.state_))) {} -}; +std::shared_ptr GetCPURandomEngine(uint64_t); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc index 079fb1479861ca0840b47470339f2f7a5b6bffa8..b50b4f37caecd8d8d5c393ee3a5c5b76c1f406be 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" +#include #include #include #include @@ -74,12 +75,17 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); auto weights_shape = weights->dims(); auto weights_shape_2d = flatten_to_2d(weights_shape, 1); + auto* weights_data = weights->mutable_data(platform::CPUPlace()); - EigenMatrixArrayMap weights_array_2d( - weights->mutable_data(platform::CPUPlace()), weights_shape_2d[0], - weights_shape_2d[1]); + EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0], + weights_shape_2d[1]); weights_array_2d.colwise() *= scale_array; + + // Check for subnormal values that slows down convolution execution + for (int i = 0; i < weights->numel(); ++i) { + if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0; + } } void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { @@ -108,13 +114,6 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { GET_CONV_BN_NODES(conv_ac_pattern); - // check if fuse can be done and if MKL-DNN should be used - FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel); - if (fuse_option == DO_NOT_FUSE) { - VLOG(3) << "do not perform conv+affinechannel fuse"; - return; - } - // Create eltwise_y (conv bias) variable VarDesc eltwise_y_in_desc( patterns::PDNodeName(name_scope_, "eltwise_y_in")); @@ -143,6 +142,7 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetOutput("Out", std::vector({ac_out->Name()})); desc.SetType("elementwise_add"); desc.SetAttr("axis", 1); + desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists("use_mkldnn")); auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. 
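As a brief aside on the subnormal check introduced above: subnormal single-precision values (smaller in magnitude than roughly 1.2e-38) can force slow microcode paths on many CPUs, so zeroing them keeps the fused convolution fast at a negligible accuracy cost. A detached sketch of the same flush follows; flush_subnormals is an illustrative name, not a helper from the pass.

#include <cmath>
#include <cstdint>

// Replace denormal (subnormal) weights with exact zeros.
inline void flush_subnormals(float* data, int64_t n) {
  for (int64_t i = 0; i < n; ++i) {
    if (std::fpclassify(data[i]) == FP_SUBNORMAL) data[i] = 0.0f;
  }
}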
GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index a91f14e56b719515bfd4d07896648e596a2282dd..9ad30506b2c3a0fac16d29c3bbee07725ff3d95d 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -36,7 +36,6 @@ namespace paddle { namespace imperative { void BasicEngine::Init(VarBase* var, bool retain_graph) { - sorted_sum_gradient_ = FLAGS_sort_sum_gradient; retain_graph_ = retain_graph; init_node_ = var->GradVarBase()->GradNode(); var->GradVarBase()->ClearGradNode(); @@ -106,7 +105,7 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) { auto& accumulator = accumulators_[var.get()]; if (!accumulator) { - if (sorted_sum_gradient_) { + if (FLAGS_sort_sum_gradient) { accumulator.reset(new SortedGradientAccumulator(var.get())); } else { accumulator.reset(new EagerGradientAccumulator(var.get())); diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index d1aa69f16868d3bcc67458330594dd149564c0bf..0906dd4f9236ecf26ef30395aa551b57e4e43b75 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -44,7 +44,6 @@ class BasicEngine : public Engine { private: std::shared_ptr init_node_; - bool sorted_sum_gradient_; std::unordered_map node_deps_; std::unordered_map> accumulators_; diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 3afe5af7f6348654c4cad3d44952cef43ba93f7e..5c717835e5cc2042a7a3fdd8c51aa6eeff1fc523 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -578,7 +578,6 @@ class PartialGradTask { bool retain_graph_; bool allow_unused_; bool only_inputs_; - bool sorted_sum_gradient_{FLAGS_sort_sum_gradient}; }; PartialGradTask::PartialGradTask( @@ -981,7 +980,7 @@ void PartialGradTask::PrepareInitialGradientAccumulators(const OpBase *op) { if (!accumulator) { accumulator.reset(new GradientAccumulationInfo( - var, sorted_sum_gradient_, create_graph_)); + var, FLAGS_sort_sum_gradient, create_graph_)); } accumulator->IncreaseTotalRefCnt(); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index a1c1e6de5fd44617a30f235a0416d897bf932075..9fbc97d55090345af3b3b12bcd138bfaecd346cc 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_analysis_config.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" @@ -103,8 +102,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // params_file_ fields. CP_MEMBER(opt_cache_dir_); - prog_file_ = std::move(other.prog_file_); - params_file_ = std::move(other.params_file_); + CP_MEMBER(prog_file_); + CP_MEMBER(params_file_); CP_MEMBER(use_fc_padding_); // GPU related. 
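The switch from std::move to CP_MEMBER above deserves a short note: in a copy constructor the source is a const reference, so std::move cannot actually move anything. A small standalone illustration follows; the Cfg type is hypothetical, and CP_MEMBER is assumed to expand to a plain member-wise copy.

#include <string>
#include <utility>

struct Cfg {
  std::string prog_file_;
  Cfg() = default;
  Cfg(const Cfg& other) {
    // `other.prog_file_` is const, so std::move yields a const rvalue and
    // std::string's copy assignment is selected anyway; nothing is moved.
    // A plain copy states the intent without appearing to mutate `other`.
    prog_file_ = std::move(other.prog_file_);  // effectively a copy
  }
};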
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cd396159f2c3de04e99a0f2ff7b9dc0c7b3628fa..bdde116dd3e046e03d200909b77c340aebe8db2c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -32,7 +32,6 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" @@ -517,6 +516,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { template <> std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { + // TODO(NHZlX): Should add the link to the doc of + // paddle_infer::CreatePredictor if (config.glog_info_disabled()) { FLAGS_logtostderr = 1; FLAGS_minloglevel = 2; // GLOG_ERROR @@ -1059,3 +1060,122 @@ USE_TRT_CONVERTER(slice); USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); #endif + +namespace paddle_infer { + +void Tensor::Reshape(const std::vector &shape) { tensor_->Reshape(shape); } + +std::vector Tensor::shape() const { return tensor_->shape(); } + +void Tensor::SetLoD(const std::vector> &x) { + return tensor_->SetLoD(x); +} + +std::vector> Tensor::lod() const { return tensor_->lod(); } + +const std::string &Tensor::name() const { return tensor_->name(); } + +DataType Tensor::type() const { return tensor_->type(); } + +Predictor::Predictor(const Config &config) { + const_cast(&config)->SwitchUseFeedFetchOps(false); + // The second parameter indicates that the discard log is not printed + predictor_ = paddle::CreatePaddlePredictor< + Config, paddle::PaddleEngineKind::kAnalysis>(config); +} + +std::vector Predictor::GetInputNames() { + return predictor_->GetInputNames(); +} + +std::unique_ptr Predictor::GetInputHandle(const std::string &name) { + auto zero_copy_tensor = predictor_->GetInputTensor(name); + std::unique_ptr tensor(new Tensor(std::move(zero_copy_tensor))); + return tensor; +} + +std::vector Predictor::GetOutputNames() { + return predictor_->GetOutputNames(); +} + +std::unique_ptr Predictor::GetOutputHandle(const std::string &name) { + auto zero_copy_tensor = predictor_->GetOutputTensor(name); + std::unique_ptr tensor(new Tensor(std::move(zero_copy_tensor))); + return tensor; +} + +bool Predictor::Run() { return predictor_->ZeroCopyRun(); } + +std::unique_ptr Predictor::Clone() { + auto analysis_pred = predictor_->Clone(); + std::unique_ptr pred(new Predictor(std::move(analysis_pred))); + return pred; +} + +void Predictor::ClearIntermediateTensor() { + predictor_->ClearIntermediateTensor(); +} + +int GetNumBytesOfDataType(DataType dtype) { + switch (dtype) { + case DataType::FLOAT32: + return sizeof(float); + case DataType::INT64: + return sizeof(int64_t); + case DataType::INT32: + return sizeof(int32_t); + case DataType::UINT8: + return sizeof(uint8_t); + default: + assert(false); + return -1; + } +} + +std::string GetVersion() { return paddle::get_version(); } + +std::string UpdateDllFlag(const char *name, const char *value) { + return paddle::UpdateDllFlag(name, value); +} + +} // namespace paddle_infer + +namespace paddle_infer { +std::shared_ptr CreatePredictor(const Config &config) { // NOLINT + std::shared_ptr predictor(new Predictor(config)); + return 
predictor; +} + +namespace services { +PredictorPool::PredictorPool(const Config &config, size_t size) { + PADDLE_ENFORCE_GE( + size, 1UL, + paddle::platform::errors::InvalidArgument( + "The predictor pool size should be greater than 1, but it's (%d)", + size)); + Config copy_config(config); + main_pred_.reset(new Predictor(config)); + for (size_t i = 0; i < size - 1; i++) { + if (config.tensorrt_engine_enabled()) { + Config config_tmp(copy_config); + preds_.push_back( + std::move(std::unique_ptr(new Predictor(config_tmp)))); + } else { + preds_.push_back(std::move(main_pred_->Clone())); + } + } +} + +Predictor *PredictorPool::Retrive(size_t idx) { + PADDLE_ENFORCE_LT( + idx, preds_.size() + 1, + paddle::platform::errors::InvalidArgument( + "There are (%d) predictors in the pool, but the idx is (%d)", idx, + preds_.size() + 1)); + if (idx == 0) { + return main_pred_.get(); + } + return preds_[idx - 1].get(); +} +} // namespace services +} // namespace paddle_infer diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 458eecfeea6ff27c96a8864ba8a08a9e5c587df5..2f608da531f25e1a5665744f7e9a2968cc9d0d64 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -112,6 +112,12 @@ void PaddleBuf::Free() { } } +NativeConfig::NativeConfig() { + LOG(WARNING) << "The paddle::NativeConfig interface is going to be " + "deprecated in the next release, plase use the latest " + "paddle_infer::Config instead."; +} + std::string get_version() { std::stringstream ss; ss << "version: " << framework::paddle_version() << "\n"; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 3d5b40c93dad071052217677e387ba54011fb666..07d6dcf86e9814e5bfc932d8320b549d55fe88ae 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include +#include #include #include #include @@ -25,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" @@ -311,6 +313,8 @@ bool NativePaddlePredictor::GetFetch(std::vector *outputs, template <> std::unique_ptr CreatePaddlePredictor< NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) { + // TODO(NHZlX): Should add the link to the doc of + // paddle_infer::CreatePredictor VLOG(3) << "create NativePaddlePredictor"; if (config.use_gpu) { // 1. GPU memory diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 386d20103a71acb34cd47ddf5527f580cc5bf5b1..064f63542683a0d95985382385b182d794da0068 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -347,6 +347,7 @@ class PD_INFER_DECL PaddlePredictor { /// place of inference, etc.) /// struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config { + NativeConfig(); /// GPU related fields. 
bool use_gpu{false}; int device{0}; @@ -421,7 +422,8 @@ enum class PaddleEngineKind { }; template -std::unique_ptr CreatePaddlePredictor(const ConfigT& config); +PD_INFER_DECL std::unique_ptr CreatePaddlePredictor( + const ConfigT& config); template <> PD_INFER_DECL std::unique_ptr CreatePaddlePredictor< @@ -437,6 +439,4 @@ PD_INFER_DECL std::string get_version(); PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value); -PD_INFER_DECL std::shared_ptr MakeCipher( - const std::string& config_file); } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 6f30ad95f168cebe9702c90fbd2cca2c79a0e83f..da5d7411693c92eaa2066c7f76d56970f8939bc7 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -22,9 +22,124 @@ limitations under the License. */ #pragma once #include +#include #include #include +#include #include #include "paddle_analysis_config.h" // NOLINT #include "paddle_api.h" // NOLINT + +namespace paddle_infer { +using DataType = paddle::PaddleDType; +using PlaceType = paddle::PaddlePlace; +using PrecisionType = paddle::AnalysisConfig::Precision; +using Config = paddle::AnalysisConfig; + +class PD_INFER_DECL Tensor { + public: + // Can only be created by predictor->GetInputHandle(cosnt std::string& name) + // or predictor->GetOutputHandle(cosnt std::string& name) + Tensor() = delete; + explicit Tensor(std::unique_ptr&& tensor) + : tensor_(std::move(tensor)) {} + void Reshape(const std::vector& shape); + + template + void CopyFromCpu(const T* data); + + // should add the place + template + T* mutable_data(PlaceType place); + + template + void CopyToCpu(T* data); + + template + T* data(PlaceType* place, int* size) const; + + void SetLoD(const std::vector>& x); + std::vector> lod() const; + + DataType type() const; + + std::vector shape() const; + const std::string& name() const; + + private: + std::unique_ptr tensor_; +}; + +class PD_INFER_DECL Predictor { + public: + Predictor() = default; + ~Predictor() {} + // Use for clone + explicit Predictor(std::unique_ptr&& pred) + : predictor_(std::move(pred)) {} + + explicit Predictor(const Config& config); + + std::vector GetInputNames(); + std::unique_ptr GetInputHandle(const std::string& name); + + bool Run(); + + std::vector GetOutputNames(); + std::unique_ptr GetOutputHandle(const std::string& name); + + std::unique_ptr Clone(); + void ClearIntermediateTensor(); + + private: + std::unique_ptr predictor_; +}; + +PD_INFER_DECL std::shared_ptr CreatePredictor( + const Config& config); // NOLINT +PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype); + +PD_INFER_DECL std::string GetVersion(); +PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value); + +template +void Tensor::CopyFromCpu(const T* data) { + tensor_->copy_from_cpu(data); +} + +template +void Tensor::CopyToCpu(T* data) { + return tensor_->copy_to_cpu(data); +} + +template +T* Tensor::mutable_data(PlaceType place) { + return tensor_->mutable_data(place); +} + +template +T* Tensor::data(PlaceType* place, int* size) const { + return tensor_->data(place, size); +} + +} // namespace paddle_infer + +namespace paddle_infer { +namespace services { + +class PD_INFER_DECL PredictorPool { + public: + PredictorPool() = delete; + PredictorPool(const PredictorPool&) = delete; + PredictorPool& operator=(const PredictorPool&) = delete; + + explicit PredictorPool(const Config& config, size_t size = 1); + 
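Taken together, the new paddle_infer surface is intended to be used roughly as follows. This is a condensed sketch of the calling sequence exercised by the tests added further down; the model path and input shape are placeholders.

#include <functional>
#include <numeric>
#include <string>
#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

void run_once(const std::string& model_dir) {
  paddle_infer::Config config;
  config.SetModel(model_dir + "/model", model_dir + "/params");
  auto predictor = paddle_infer::CreatePredictor(config);

  // Feed a single zero-filled image of shape {1, 3, 318, 318}.
  auto input = predictor->GetInputHandle(predictor->GetInputNames()[0]);
  std::vector<float> data(1 * 3 * 318 * 318, 0.0f);
  input->Reshape({1, 3, 318, 318});
  input->CopyFromCpu(data.data());

  predictor->Run();

  // Copy the first output back to host memory.
  auto output = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
  auto shape = output->shape();
  int out_num = std::accumulate(shape.begin(), shape.end(), 1,
                                std::multiplies<int>());
  std::vector<float> result(out_num);
  output->CopyToCpu(result.data());
}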
Predictor* Retrive(size_t idx); + + private: + std::shared_ptr main_pred_; + std::vector> preds_; +}; +} // namespace services +} // namespace paddle_infer diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index ffb70700b5f98a51b579a68f746ea1ee6a6d9f7b..98a36a3308dc539ee5aecad9e71f50be310e584c 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -185,12 +185,14 @@ void CpuPassStrategy::EnableMKLDNN() { passes_.insert(passes_.begin(), "mkldnn_placement_pass"); for (auto &pass : std::vector({ - "depthwise_conv_mkldnn_pass", // - "conv_bn_fuse_pass", // Execute BN passes again to - "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order - "conv_transpose_bn_fuse_pass", // - "conv_transpose_eltwiseadd_bn_fuse_pass", // - "conv_bias_mkldnn_fuse_pass", // + "depthwise_conv_mkldnn_pass", // + "conv_bn_fuse_pass", // Execute BN passes again to + "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_transpose_bn_fuse_pass", // + "conv_transpose_eltwiseadd_bn_fuse_pass", // + "conv_bias_mkldnn_fuse_pass", // "conv_transpose_bias_mkldnn_fuse_pass", "conv3d_bias_mkldnn_fuse_pass", // "conv_elementwise_add_mkldnn_fuse_pass", diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 814deda6729278e2e9f9e76ff83bbdd4966821c1..2bd30bc05179e2881c4ecb321d76d5506233cc0e 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -515,3 +515,9 @@ if(WITH_MKLDNN) inference_analysis_test(test_analyzer_capi_ner SRCS analyzer_capi_ner_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) + +if(WITH_GPU) + inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${RESNET50_MODEL_DIR}) +endif() diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc index 5840a4c42b3b1065410dc1509cf0cee2480bd596..31701c59ec33dfced5745f7f16d8f00ffce462ef 100644 --- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc @@ -72,3 +72,59 @@ TEST(AnalysisPredictor, use_gpu) { } // namespace inference } // namespace paddle + +namespace paddle_infer { + +TEST(Predictor, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/" + "model"; + Config config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir + "/model", model_dir + "/params"); + config.EnableLiteEngine(PrecisionType::kFloat32); + + auto predictor = CreatePredictor(config); + const int batch = 1; + const int channel = 3; + const int height = 318; + const int width = 318; + const int input_num = batch * channel * height * width; + std::vector input(input_num, 1); + + auto input_names = predictor->GetInputNames(); + auto input_t = predictor->GetInputHandle(input_names[0]); + + input_t->Reshape({1, 3, 318, 318}); + input_t->CopyFromCpu(input.data()); + predictor->Run(); + + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_t->shape(); + size_t out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + + std::vector 
out_data; + out_data.resize(out_num); + output_t->CopyToCpu(out_data.data()); + + const std::vector truth_values = { + 127.780396f, 738.16656f, 1013.2264f, -438.17206f, 366.4022f, + 927.66187f, 736.2241f, -633.68567f, -329.92737f, -430.15637f, + -633.0639f, -146.54858f, -1324.2804f, -1349.3661f, -242.67671f, + 117.44864f, -801.7251f, -391.51495f, -404.8202f, 454.16132f, + 515.48206f, -133.03114f, 69.293076f, 590.09753f, -1434.6917f, + -1070.8903f, 307.0744f, 400.52573f, -316.12177f, -587.1265f, + -161.05742f, 800.3663f, -96.47157f, 748.708f, 868.17645f, + -447.9403f, 112.73656f, 1127.1992f, 47.43518f, 677.7219f, + 593.1881f, -336.4011f, 551.3634f, 397.82474f, 78.39835f, + -715.4006f, 405.96988f, 404.25684f, 246.01978f, -8.430191f, + 131.36617f, -648.0528f}; + + float* data_o = out_data.data(); + for (size_t j = 0; j < out_num; j += 10) { + EXPECT_NEAR((data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0., + 10e-5); + } +} + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..fee7c35581d3293f0036360b64961910d9eb02a7 --- /dev/null +++ b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/tests/api/trt_test_helper.h" + +namespace paddle_infer { + +TEST(Predictor, use_gpu) { + LOG(INFO) << GetVersion(); + UpdateDllFlag("conv_workspace_size_limit", "4000"); + std::string model_dir = FLAGS_infer_model + "/model"; + Config config; + config.SetModel(model_dir + "/model", model_dir + "/params"); + config.EnableUseGpu(100, 0); + + auto predictor = CreatePredictor(config); + auto pred_clone = predictor->Clone(); + + std::vector in_shape = {1, 3, 318, 318}; + int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1, + [](int &a, int &b) { return a * b; }); + + std::vector input(in_num, 0); + + auto input_names = predictor->GetInputNames(); + auto input_t = predictor->GetInputHandle(input_names[0]); + + input_t->Reshape(in_shape); + input_t->CopyFromCpu(input.data()); + predictor->Run(); + + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + + std::vector out_data; + out_data.resize(out_num); + output_t->CopyToCpu(out_data.data()); + predictor->ClearIntermediateTensor(); +} + +TEST(PredictorPool, basic) { + LOG(INFO) << GetVersion(); + UpdateDllFlag("conv_workspace_size_limit", "4000"); + std::string model_dir = FLAGS_infer_model + "/model"; + Config config; + config.SetModel(model_dir + "/model", model_dir + "/params"); + config.EnableUseGpu(100, 0); + + services::PredictorPool pred_pool(config, 4); + auto pred = pred_pool.Retrive(2); + + std::vector in_shape = {1, 3, 318, 318}; + int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1, + [](int &a, int &b) { return a * b; }); + std::vector input(in_num, 0); + + auto in_names = pred->GetInputNames(); + auto input_t = pred->GetInputHandle(in_names[0]); + input_t->name(); + input_t->Reshape(in_shape); + input_t->CopyFromCpu(input.data()); + pred->Run(); + auto out_names = pred->GetOutputNames(); + auto output_t = pred->GetOutputHandle(out_names[0]); + auto out_type = output_t->type(); + LOG(INFO) << GetNumBytesOfDataType(out_type); + if (out_type == DataType::FLOAT32) { + PlaceType place; + int size; + output_t->data(&place, &size); + } +} + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc index 8ffa3efdf0556bd7cde7efa615f60853ad18d903..c7c7356b6e8831bc0bcd0e9ea4ad0fbdec8b6be2 100644 --- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc +++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc @@ -41,7 +41,7 @@ TEST(AnalysisPredictor, use_gpu) { SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); std::vector outputs; - for (auto& input : inputs_all) { + for (auto &input : inputs_all) { ASSERT_TRUE(predictor->Run(input, &outputs)); predictor->ClearIntermediateTensor(); } @@ -49,3 +49,27 @@ TEST(AnalysisPredictor, use_gpu) { } // namespace inference } // namespace paddle + +namespace paddle_infer { +TEST(PredictorPool, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; + Config config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); + config.EnableTensorRtEngine(); + services::PredictorPool pred_pool(config, 1); + + auto predictor = pred_pool.Retrive(0); + auto input_names = predictor->GetInputNames(); + auto input_t = 
predictor->GetInputHandle(input_names[0]); + std::vector in_shape = {1, 3, 224, 224}; + int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1, + [](int &a, int &b) { return a * b; }); + + std::vector input(in_num, 0); + input_t->Reshape(in_shape); + input_t->CopyFromCpu(input.data()); + predictor->Run(); +} + +} // namespace paddle_infer diff --git a/paddle/fluid/operators/bernoulli_op.cc b/paddle/fluid/operators/bernoulli_op.cc index c525da5953d76d4406fbdd0d9d6e98619e409f71..79c4e2c2bba3191535f53e2ef2a32cd66e36230c 100644 --- a/paddle/fluid/operators/bernoulli_op.cc +++ b/paddle/fluid/operators/bernoulli_op.cc @@ -64,11 +64,11 @@ class BernoulliOpKernel int64_t size = x->numel(); std::uniform_real_distribution dist(0.0, 1.0); - auto gen_ptr = framework::Generator::GetInstance(); - std::mt19937_64 &gen_engine = gen_ptr->GetCPUEngine(); + auto gen_ptr = framework::DefaultCPUGenerator(); + auto engine = gen_ptr->GetCPUEngine(); for (int64_t i = 0; i < size; ++i) { - out_data[i] = BernoulliFunctor(in_data[i], dist(gen_engine)); + out_data[i] = BernoulliFunctor(in_data[i], dist(*engine)); } } }; // namespace operators diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 25e887ba6675e6c28bcd44c3b57c2ea571c075e3..7e0e77214c5320aa9a807fc65531f163fa7ce09e 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" #ifdef PADDLE_WITH_MKLDNN @@ -567,3 +568,14 @@ REGISTER_OP_CPU_KERNEL( ops::GemmConvTransposeGradKernel, ops::GemmConvTransposeGradKernel); + +REGISTER_OP_VERSION(conv_transpose) + .AddCheckpoint( + R"ROC( + Upgrade convtranspose add a new attribute [output_padding]. + )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "output_padding", + "In order to add additional size to one side of each dimension " + "in the output", + {})); diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h index 0d7032e286caab93dbd38f35881e9064694a8307..9e39e68cba779de4dc598046e45f7d35e292bb79 100644 --- a/paddle/fluid/operators/distributed/large_scale_kv.h +++ b/paddle/fluid/operators/distributed/large_scale_kv.h @@ -14,20 +14,19 @@ #pragma once +#include #include #include #include // NOLINT #include #include +#include // NOLINT #include #include #include #include -#include // NOLINT - -#include #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/rw_lock.h" @@ -89,26 +88,17 @@ class UniformInitializer : public Initializer { min_ = std::stof(attrs[2]); max_ = std::stof(attrs[3]); - if (seed_ == 0) { - seed_ = std::random_device()(); - } - - random_engine_.seed(seed_); dist_ = std::uniform_real_distribution(min_, max_); + random_engine_ = framework::GetCPURandomEngine(seed_); } - float GetValue() override { - return framework::Generator::GetInstance()->is_init_py - ? 
dist_(framework::Generator::GetInstance()->GetCPUEngine()) - : dist_(random_engine_); - // return dist_(random_engine_); - } + float GetValue() override { return dist_(*random_engine_); } private: float min_; float max_; - std::minstd_rand random_engine_; + std::shared_ptr random_engine_; std::uniform_real_distribution dist_; }; @@ -139,26 +129,18 @@ class GaussianInitializer : public Initializer { mean_ = std::stof(attrs[2]); std_ = std::stof(attrs[3]); - if (seed_ == 0) { - seed_ = std::random_device()(); - } + random_engine_ = framework::GetCPURandomEngine(seed_); - random_engine_.seed(seed_); dist_ = std::normal_distribution(mean_, std_); } - float GetValue() override { - return framework::Generator::GetInstance()->is_init_py - ? dist_(framework::Generator::GetInstance()->GetCPUEngine()) - : dist_(random_engine_); - // return dist_(random_engine_); - } + float GetValue() override { return dist_(*random_engine_); } private: float std_; float mean_; - std::minstd_rand random_engine_; + std::shared_ptr random_engine_; std::normal_distribution dist_; }; diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h index bce4c7ca19a603fd2eadaff7f82b5cdec91bb79f..9d9eb4a82a075f27764a73d0e976dbf3f7181cb1 100644 --- a/paddle/fluid/operators/dropout_op.h +++ b/paddle/fluid/operators/dropout_op.h @@ -55,30 +55,22 @@ class CPUDropoutKernel : public framework::OpKernel { std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT return; } - - bool init_generator_py = framework::Generator::GetInstance()->is_init_py; - + // std::minstd_rand engine; // NOTE: fixed seed should only be used in unittest or for debug. // Guarantee to use random seed in training. - std::random_device rnd; - std::minstd_rand engine; - int seed_data; + int seed_data = 0; if (seed) { seed_data = *(seed->data()); } else { seed_data = - context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + context.Attr("fix_seed") ? context.Attr("seed") : 0; } - engine.seed(seed_data); + auto engine = framework::GetCPURandomEngine(seed_data); std::uniform_real_distribution dist(0, 1); for (size_t i = 0; i < size; ++i) { - float cur_random = - init_generator_py - ? 
dist(framework::Generator::GetInstance()->GetCPUEngine()) - : dist(engine); - if (cur_random < dropout_prob) { + if (dist(*engine) < dropout_prob) { mask_data[i] = 0; y_data[i] = 0; } else { diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 111d4ad4490074fb53671f6f3180cf17c5abe913..4f128463375b91803a7a4d02a27dd78157961aac 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -39,26 +39,14 @@ class CPUGaussianRandomKernel : public framework::OpKernel { tensor->Resize(shape); int64_t size = tensor->numel(); T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); + auto engine = framework::GetCPURandomEngine(seed); - if (framework::Generator::GetInstance()->is_init_py) { - std::mt19937_64& gen_engine = - framework::Generator::GetInstance()->GetCPUEngine(); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(gen_engine); - } - } else { - unsigned int seed = static_cast(context.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(engine); - } + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); } } -}; +}; // namespace operators template class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc index 86feaa72d5fa69cd5d76e56182c27b8d048e4c74..a4bdc923eecc3e1283e642e592f91d658e7c9aa7 100644 --- a/paddle/fluid/operators/math/sampler.cc +++ b/paddle/fluid/operators/math/sampler.cc @@ -13,11 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sampler.h" + #include + #include #include #include #include + #include "paddle/fluid/framework/generator.h" namespace paddle { @@ -28,22 +31,17 @@ Sampler::~Sampler() {} UniformSampler::UniformSampler(int64_t range, unsigned int seed) : Sampler(range, seed), inv_range_(1.0 / (range + 1)) { - random_engine_ = std::make_shared(seed_); + random_engine_ = framework::GetCPURandomEngine(seed_); dist_ = std::make_shared>(0, range); } -int64_t UniformSampler::Sample() const { - return framework::Generator::GetInstance()->is_init_py - ? (*dist_)(framework::Generator::GetInstance()->GetCPUEngine()) - : (*dist_)(*random_engine_); - // return (*dist_)(*random_engine_); -} +int64_t UniformSampler::Sample() const { return (*dist_)(*random_engine_); } float UniformSampler::Probability(int64_t value) const { return inv_range_; } LogUniformSampler::LogUniformSampler(int64_t range, unsigned int seed) : Sampler(range, seed), log_range_(log(range + 1)) { - random_engine_ = std::make_shared(seed_); + random_engine_ = framework::GetCPURandomEngine(seed_); dist_ = std::make_shared>(0, 1); } @@ -52,10 +50,7 @@ int64_t LogUniformSampler::Sample() const { // inverse_transform_sampling method // More details: // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/ - auto cur_random = - framework::Generator::GetInstance()->is_init_py - ? 
(*dist_)(framework::Generator::GetInstance()->GetCPUEngine()) - : (*dist_)(*random_engine_); + auto cur_random = (*dist_)(*random_engine_); const int64_t value = static_cast(exp(cur_random * log_range_)) - 1; // Mathematically, value should be <= range_, but might not be due to some // floating point roundoff, so we mod by range_. @@ -74,7 +69,7 @@ CustomSampler::CustomSampler(int64_t range, const float *probabilities, const int *alias, const float *alias_probabilities, unsigned int seed) : Sampler(range, seed) { - random_engine_ = std::make_shared(seed_); + random_engine_ = framework::GetCPURandomEngine(seed_); real_dist_ = std::make_shared>(0, 1); int_dist_ = std::make_shared>(0, range); @@ -84,14 +79,8 @@ CustomSampler::CustomSampler(int64_t range, const float *probabilities, } int64_t CustomSampler::Sample() const { - auto index = - framework::Generator::GetInstance()->is_init_py - ? (*int_dist_)(framework::Generator::GetInstance()->GetCPUEngine()) - : (*int_dist_)(*random_engine_); - auto p = - framework::Generator::GetInstance()->is_init_py - ? (*real_dist_)(framework::Generator::GetInstance()->GetCPUEngine()) - : (*real_dist_)(*random_engine_); + auto index = (*int_dist_)(*random_engine_); + auto p = (*real_dist_)(*random_engine_); if (p > alias_probs_[index]) { int alias = alias_[index]; diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 3fa5a7ae336a9be984324411b88570aea99c2c78..480576ef9dc8c21811a1a867d553ccc6d97fa22a 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -26,8 +26,8 @@ namespace math { // TODO(wanghaoshuang): Support for GPU /** -* Sample integers from [0, range). -*/ + * Sample integers from [0, range). + */ class Sampler { public: explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { @@ -117,7 +117,7 @@ class CustomSampler : public Sampler { const int* alias_; const float* probs_; const int exceptional_val = -1; - std::shared_ptr random_engine_; + std::shared_ptr random_engine_; std::shared_ptr> real_dist_; std::shared_ptr> int_dist_; }; diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index a50cc22e5bb0def54b057dcc23d2f6751eecc478..40737f4cd029b47dbd03069a2e4d29ad33121eb9 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -44,6 +44,7 @@ class FCPrimitiveFactory { void ExecuteFcPrimitive(const LoDTensor* input, const Tensor* weights, const Tensor* bias, LoDTensor* output, + const MKLDNNDeviceContext& dev_ctx, const ExecutionContext& ctx) { RecomputeOutputDims(ctx, input, weights, output); // If primitive has already been created and cached, don't create new one, @@ -74,8 +75,8 @@ class FCPrimitiveFactory { "input format is equal to ncw.")); } - // Transform weights to default MKL-DNN format - weights_ = TransposeWeights(weights); + weights_ = CreateWeightsMemory(weights); + // Since MKL-DNN has a lot of limitations on what the input/weights/output // dimensions should be, to simplify the code, the creation of primitive // descriptor has been divided into separate cases, based on the number @@ -112,10 +113,13 @@ class FCPrimitiveFactory { // Quantize weights and reorder to format chosen by FC primitive descriptor. 
QuantizeWeights(ctx, fc_prim_desc->weights_desc()); - bias_ = CreateMemory(fc_prim_desc->bias_desc(), bias); + bias_ = CreateMemoryToBeCached(fc_prim_desc->bias_desc(), bias); // If int8 is desired, quantize bias into 32-bit signed int QuantizeBias(*fc_prim_desc, ctx); + // Store weights and bias in the mkldnn cache + CacheWeightsAndBias(dev_ctx, ctx); + // Based on format determined by inner_product, create output in desired // memory format output_ = CreateDstMemory(*fc_prim_desc, ctx, output); @@ -262,14 +266,15 @@ class FCPrimitiveFactory { } // Convert data from one data format to another - mkldnn::memory Reorder(const memory::desc& src_desc, - const memory::desc& dst_desc, void* src_data) { + std::shared_ptr Reorder(const memory::desc& src_desc, + const memory::desc& dst_desc, + void* src_data) { auto src_mem = memory(src_desc, engine_, src_data); - auto dst_mem = memory(dst_desc, engine_); + auto dst_mem = std::make_shared(dst_desc, engine_); - auto reorder = mkldnn::reorder(src_mem, dst_mem); + auto reorder = mkldnn::reorder(src_mem, *dst_mem); mkldnn::stream astream(engine_); - reorder.execute(astream, src_mem, dst_mem); + reorder.execute(astream, src_mem, *dst_mem); astream.wait(); return dst_mem; @@ -277,9 +282,10 @@ class FCPrimitiveFactory { // Convert data from one data format to another and rescale it. // If the desired data type is (un)signed int8, quantization occurs here. - mkldnn::memory Reorder(const memory& src_mem, const memory::desc& dst_md, - const std::vector& scale_data) { - mkldnn::memory dst_mem = mkldnn::memory(dst_md, engine_); + std::shared_ptr ReorderWithScale( + const std::shared_ptr src_mem, const memory::desc& dst_md, + const std::vector& scale_data) { + auto dst_mem = std::make_shared(dst_md, engine_); mkldnn::primitive_attr attributes; // According to MKL-DNN's documentation mask determines along which // dimensions should the scale be applied. @@ -289,11 +295,11 @@ class FCPrimitiveFactory { // becuase we perform per-output-channel quantization int mask = CreateMask(0, scale_data.size() > 1); attributes.set_output_scales(mask, scale_data); - auto reorder = mkldnn::reorder(src_mem, dst_mem, attributes); + auto reorder = mkldnn::reorder(*src_mem, *dst_mem, attributes); mkldnn::stream astream(engine_); reorder.execute(astream, - {{MKLDNN_ARG_FROM, src_mem}, {MKLDNN_ARG_TO, dst_mem}}); + {{MKLDNN_ARG_FROM, *src_mem}, {MKLDNN_ARG_TO, *dst_mem}}); astream.wait(); return dst_mem; @@ -323,16 +329,38 @@ class FCPrimitiveFactory { return memory(desc, engine_, data); } - // Transpose weights through MKL-DNN's reorder from io to oi format. - mkldnn::memory TransposeWeights(const Tensor* weights) { + template + std::shared_ptr CreateMemoryToBeCached( + const mkldnn::memory::desc& desc, const Tensor* tensor) { + return CreateMemoryToBeCached(desc, + platform::to_void_cast(tensor->data())); + } + + std::shared_ptr CreateMemoryToBeCached( + const mkldnn::memory::desc& desc, void* data) { + return std::make_shared(desc, engine_, data); + } + + // Create weights memory and transform to default MKL-DNN format + std::shared_ptr CreateWeightsMemory(const Tensor* weights) { auto dims = framework::vectorize(weights->dims()); std::swap(dims[0], dims[1]); // Correct output dimensions auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io); auto dst_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oi); + // Transpose weights through MKL-DNN's reorder from io to oi format. 
return Reorder(src_desc, dst_desc, platform::to_void_cast(weights->data())); } + void CacheWeightsAndBias(const MKLDNNDeviceContext& dev_ctx, + const ExecutionContext& ctx) { + const std::string key = platform::CreateKey(platform::ThreadIDasStr()); + const std::string weights_key = key + ctx.InputName("W"); + const std::string bias_key = key + ctx.InputName("Bias"); + dev_ctx.SetBlob(weights_key, weights_); + dev_ctx.SetBlob(bias_key, bias_); + } + // Compute the bias scales so that its values correspond to the // scale of data being an output of weights and input multiplication std::vector ComputeBiasScales(const ExecutionContext& ctx) { @@ -388,14 +416,14 @@ class FCPrimitiveFactory { } void QuantizeWeights(const ExecutionContext& ctx, memory::desc dst) { - weights_ = - Reorder(*weights_, dst, ctx.Attr>("Scale_weights")); + weights_ = ReorderWithScale(weights_, dst, + ctx.Attr>("Scale_weights")); } void QuantizeBias(const inner_product_forward::primitive_desc& fc_prim_desc, const ExecutionContext& ctx) { auto bias_scales = ComputeBiasScales(ctx); - bias_ = Reorder(*bias_, fc_prim_desc.bias_desc(), bias_scales); + bias_ = ReorderWithScale(bias_, fc_prim_desc.bias_desc(), bias_scales); } // Fuse relu into FC with activation type attribute has been set to 'relu' @@ -463,10 +491,10 @@ class FCPrimitiveFactory { private: const mkldnn::engine& engine_; - boost::optional bias_; boost::optional input_; boost::optional output_; - boost::optional weights_; + std::shared_ptr bias_; + std::shared_ptr weights_; boost::optional fc_; }; @@ -476,19 +504,13 @@ class FCPrimitiveFactory { template static std::shared_ptr> GetPrimitiveFactory(const MKLDNNDeviceContext& dev_ctx, - const ExecutionContext& ctx, const Tensor* input, - const Tensor* weights, - const mkldnn::engine& mkldnn_engine) { - const std::string key = platform::CreateKey( - platform::ThreadIDasStr(), input->format(), input->dims()[0], - framework::vectorize(weights->dims()), ctx.OutputName("Out")); - + const std::string& key) { auto prim_creator = std::static_pointer_cast>( dev_ctx.GetBlob(key)); if (prim_creator == nullptr) { - prim_creator = - std::make_shared>(mkldnn_engine); + prim_creator = std::make_shared>( + dev_ctx.GetEngine()); dev_ctx.SetBlob(key, prim_creator); } @@ -498,24 +520,24 @@ GetPrimitiveFactory(const MKLDNNDeviceContext& dev_ctx, // Choose appropriate primitive factory implementation based on inferred // output type (uint8, int8 or float). 
template -static void ExecuteFc(const MKLDNNDeviceContext& dev_ctx, - const ExecutionContext& ctx, const LoDTensor* input, +static void ExecuteFc(const ExecutionContext& ctx, const LoDTensor* input, const Tensor* w, const Tensor* bias, LoDTensor* output, - const mkldnn::engine& mkldnn_engine, bool fuse_relu, - bool force_fp32_output) { + bool fuse_relu, bool force_fp32_output) { + auto& dev_ctx = ctx.template device_context(); + const std::string prim_key = platform::CreateKey( + platform::ThreadIDasStr(), input->format(), input->dims()[0], + framework::vectorize(w->dims()), ctx.OutputName("Out")); constexpr bool is_int8 = std::is_same::value || std::is_same::value; if (!is_int8 || force_fp32_output) { - GetPrimitiveFactory(dev_ctx, ctx, input, w, mkldnn_engine) - ->ExecuteFcPrimitive(input, w, bias, output, ctx); + GetPrimitiveFactory(dev_ctx, prim_key) + ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx); } else if (fuse_relu) { - GetPrimitiveFactory(dev_ctx, ctx, input, w, - mkldnn_engine) - ->ExecuteFcPrimitive(input, w, bias, output, ctx); + GetPrimitiveFactory(dev_ctx, prim_key) + ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx); } else { - GetPrimitiveFactory(dev_ctx, ctx, input, w, - mkldnn_engine) - ->ExecuteFcPrimitive(input, w, bias, output, ctx); + GetPrimitiveFactory(dev_ctx, prim_key) + ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx); } } @@ -526,9 +548,6 @@ class FCMKLDNNOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::is_cpu_place(ctx.GetPlace()), true, platform::errors::PreconditionNotMet("FC MKL-DNN must use CPUPlace.")); - auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - auto input = ctx.Input("Input"); auto w = ctx.Input("W"); auto bias = ctx.Input("Bias"); @@ -537,8 +556,8 @@ class FCMKLDNNOpKernel : public framework::OpKernel { bool fuse_relu = ctx.Attr("activation_type") == "relu"; bool force_fp32_output = ctx.Attr("force_fp32_output"); - ExecuteFc(dev_ctx, ctx, input, w, bias, output, mkldnn_engine, - fuse_relu, force_fp32_output); + ExecuteFc(ctx, input, w, bias, output, fuse_relu, + force_fp32_output); output->set_layout(DataLayout::kMKLDNN); } diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index d0ecca78ae8b27451bc51a3c1561609fc470a9f8..98200caca8cf66960632b88966f23e99fcd4c299 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/operators/fill_constant_op.h" #include "paddle/fluid/operators/mean_op.h" @@ -35,23 +36,11 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { T* data = tensor->mutable_data(context.GetPlace()); int64_t size = tensor->numel(); std::normal_distribution dist(mean, std); + unsigned int seed = static_cast(context.Attr("seed")); + auto engine = framework::GetCPURandomEngine(seed); - if (framework::Generator::GetInstance()->is_init_py) { - std::mt19937_64& gen_engine = - framework::Generator::GetInstance()->GetCPUEngine(); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(gen_engine); - } - } else { - unsigned int seed = static_cast(context.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(engine); - } + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); } tensor->set_layout(DataLayout::kMKLDNN); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index bde7131379a272e31fb1effe2f92204fa27f9a14..e3da79125be24f3156b10a4d1daedd3db2b841cf 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -24,49 +24,69 @@ class AdadeltaOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(Param) of AdadeltaOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(Grad) of AdadeltaOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("AvgSquaredGrad"), - "Input(AvgSquaredGrad) of AdadeltaOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"), - "Input(AvgSquaredUpdate) of AdadeltaOp should not be null."); - PADDLE_ENFORCE( + PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, + platform::errors::InvalidArgument( + "Input(Param) of AdadeltaOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, + platform::errors::InvalidArgument( + "Input(Grad) of AdadeltaOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("AvgSquaredGrad"), true, + platform::errors::InvalidArgument( + "Input(AvgSquaredGrad) of AdadeltaOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("AvgSquaredUpdate"), true, + platform::errors::InvalidArgument( + "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.")); + PADDLE_ENFORCE_EQ( ctx->GetInputsVarType("Param").front() == framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); - PADDLE_ENFORCE( + true, + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), + ctx->GetInputsVarType("Param").front())); + PADDLE_ENFORCE_EQ( ctx->GetInputsVarType("Grad").front() == framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); - - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(ParamOut) of AdadeltaOp should not be null."); - PADDLE_ENFORCE( - ctx->HasOutput("AvgSquaredGradOut"), - "Output(AvgSquaredGradOut) of AdadeltaOp should not be null."); - PADDLE_ENFORCE( - 
ctx->HasOutput("AvgSquaredUpdateOut"), - "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null."); + true, + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), + ctx->GetInputsVarType("Grad").front())); + + PADDLE_ENFORCE_EQ( + ctx->HasOutput("ParamOut"), true, + platform::errors::InvalidArgument( + "Output(ParamOut) of AdadeltaOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("AvgSquaredGradOut"), true, + platform::errors::InvalidArgument( + "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("AvgSquaredUpdateOut"), true, + platform::errors::InvalidArgument( + "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.")); auto param_dim = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ( param_dim, ctx->GetInputDim("Grad"), "param and grad input of AdadeltaOp should have same dimension"); - PADDLE_ENFORCE_NE(framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0, - "Maybe the Input variable AvgSquaredGrad has not " - "been initialized. You may need to confirm if you put " - "exe.run(startup_program) after optimizer.minimize " - "function."); + PADDLE_ENFORCE_NE( + framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0, + platform::errors::InvalidArgument( + "Maybe the Input variable AvgSquaredGrad has not " + "been initialized. You may need to confirm if you put " + "exe.run(startup_program) after optimizer.minimize " + "function.")); PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"), - "Param and AvgSquaredGrad input of AdadeltaOp " - "should have same dimension"); + platform::errors::InvalidArgument( + "Param and AvgSquaredGrad input of AdadeltaOp " + "should have same dimension")); PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"), - "Param and AvgSquaredUpdate input of AdadeltaOp " - "should have same dimension"); + platform::errors::InvalidArgument( + "Param and AvgSquaredUpdate input of AdadeltaOp " + "should have same dimension")); ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("AvgSquaredGradOut", param_dim); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h index e66dec7cf0ff686f91103e438b6374fce29af774..85cfad35858bbe6b112169f196c0711d981e9446 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ b/paddle/fluid/operators/optimizers/adadelta_op.h @@ -24,17 +24,19 @@ class AdadeltaOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE(param_var->IsType(), - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type())); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE(grad_var->IsType(), - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type())); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + 
framework::ToTypeName(grad_var->Type()))); auto param_out_tensor = ctx.Output("ParamOut"); auto avg_squared_grad_out_tensor = diff --git a/paddle/fluid/operators/randint_op.cc b/paddle/fluid/operators/randint_op.cc index 662fe3bcb3b3b2d26afaef0c9388dda329aea645..b3a2e14331955f42761601cabe52311e40c28eaa 100644 --- a/paddle/fluid/operators/randint_op.cc +++ b/paddle/fluid/operators/randint_op.cc @@ -46,22 +46,11 @@ class CPURandintKernel : public framework::OpKernel { std::uniform_int_distribution dist(ctx.Attr("low"), ctx.Attr("high") - 1); + unsigned int seed = static_cast(ctx.Attr("seed")); + auto engine = framework::GetCPURandomEngine(seed); - if (framework::Generator::GetInstance()->is_init_py) { - std::mt19937_64& gen_engine = - framework::Generator::GetInstance()->GetCPUEngine(); - for (int64_t i = 0; i < size; ++i) data[i] = dist(gen_engine); - } else { - unsigned int seed = static_cast(ctx.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); - - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(engine); - } + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); } } }; diff --git a/paddle/fluid/operators/randperm_op.h b/paddle/fluid/operators/randperm_op.h index 0eb028ad806848a559ba51b9c950d324a598a851..02aabb9a7b569c5aa8354f191c0de1497d9f9324 100644 --- a/paddle/fluid/operators/randperm_op.h +++ b/paddle/fluid/operators/randperm_op.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor_util.h" @@ -29,20 +30,12 @@ namespace operators { template static inline void random_permate(T* data_ptr, int num, unsigned int seed) { + auto engine = framework::GetCPURandomEngine(seed); for (int i = 0; i < num; ++i) { data_ptr[i] = static_cast(i); } - if (framework::Generator::GetInstance()->is_init_py) { - std::shuffle(data_ptr, data_ptr + num, - framework::Generator::GetInstance()->GetCPUEngine()); - } else { - if (seed == 0) { - seed = std::random_device()(); - } - std::srand(seed); - std::random_shuffle(data_ptr, data_ptr + num); - } + std::shuffle(data_ptr, data_ptr + num, *engine); } template diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h index a09220b1ccd13604b6d842237c8176578967ac64..9bec08f593afebae736fcbb3eb42fc20992df779 100644 --- a/paddle/fluid/operators/sampling_id_op.h +++ b/paddle/fluid/operators/sampling_id_op.h @@ -51,20 +51,15 @@ class SamplingIdKernel : public framework::OpKernel { framework::TensorToVector(*input, context.device_context(), &ins_vector); unsigned int seed = static_cast(context.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); + std::uniform_real_distribution dist( static_cast(context.Attr("min")), static_cast(context.Attr("max"))); + auto engine = framework::GetCPURandomEngine(seed); std::vector ids(batch_size); for (int i = 0; i < batch_size; ++i) { - T r = framework::Generator::GetInstance()->is_init_py - ? 
dist(framework::Generator::GetInstance()->GetCPUEngine()) - : dist(engine); + T r = dist(*engine); int idx = width - 1; for (int j = 0; j < width; ++j) { if ((r -= ins_vector[i * width + j]) < 0) { diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index 1fbf6d00ef763f4cb608be6d62cf4bff54f620ec..d3f9754d307c6040a66a3452d7bb008159ff46e5 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -23,22 +23,27 @@ class TopkOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of TopkOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of TopkOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Indices"), - "Output(Indices) of TopkOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + platform::errors::InvalidArgument( + "Input(X) of TopkOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + platform::errors::InvalidArgument( + "Output(Out) of TopkOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Indices"), true, + platform::errors::InvalidArgument( + "Output(Indices) of TopkOp should not be null.")); auto input_dims = ctx->GetInputDim("X"); const int k = static_cast(ctx->Attrs().Get("k")); PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); - PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape"); + PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( + "input must have >= 1d shape")); if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k, - "input must have >= k columns"); + PADDLE_ENFORCE_GE( + input_dims[input_dims.size() - 1], k, + platform::errors::InvalidArgument("input must have >= k columns")); } framework::DDim dims = input_dims; diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index d8b2e92616091a8c822c6fd0bfdfb1148c25534d..0a694e1ad5b012d70a89ddcca2d70fbe8c9e24ba 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -43,8 +43,9 @@ template class TopkOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::InvalidArgument("It must use CUDAPlace.")); auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); auto* indices = ctx.Output("Indices"); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index 3aa9ff544af63993521d41604cecef0b283ebc1e..419f0f7a2a57822b422986f9b676de138a7404cd 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
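The CPU kernels in this patch (gaussian_random, randint, randperm, sampling_id above, plus truncated_gaussian_random and uniform_random below) all move to the shared framework::GetCPURandomEngine(seed) helper: an op whose seed attribute is 0 draws from the global generator once it has been seeded from Python, and otherwise seeds its own engine. A minimal Python-level sketch of the intended user-visible effect, assuming the 2.0-style paddle.manual_seed and paddle.randn APIs used elsewhere in this patch; the reproducibility claim is the expected behavior, not output verified here:

import paddle

paddle.disable_static()

# Seeding the global generator makes ops whose "seed" attribute is left at 0
# draw from the shared CPU engine, so repeated runs become reproducible.
paddle.manual_seed(2020)
a = paddle.randn([2, 3], 'float32')

paddle.manual_seed(2020)
b = paddle.randn([2, 3], 'float32')

# With the same global seed the two draws are expected to match.
print((a.numpy() == b.numpy()).all())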
*/ #include #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" @@ -167,22 +168,10 @@ class CPUTruncatedGaussianRandomKernel : public framework::OpKernel { TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); - if (framework::Generator::GetInstance()->is_init_py) { - std::mt19937_64& gen_engine = - framework::Generator::GetInstance()->GetCPUEngine(); - for (int64_t i = 0; i < size; ++i) { - data[i] = truncated_normal(dist(gen_engine)); - } - } else { - unsigned int seed = static_cast(context.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); - for (int64_t i = 0; i < size; ++i) { - data[i] = truncated_normal(dist(engine)); - } + unsigned int seed = static_cast(context.Attr("seed")); + auto engine = framework::GetCPURandomEngine(seed); + for (int64_t i = 0; i < size; ++i) { + data[i] = truncated_normal(dist(*engine)); } } }; diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index a4487cde277990a725fd4c37b6d807278e314343..9cffe09a33abf29308072d6b3c8bfb8a636048da 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/uniform_random_op.h" + #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -62,34 +64,12 @@ class CPUUniformRandomKernel : public framework::OpKernel { std::uniform_real_distribution dist( static_cast(ctx.Attr("min")), static_cast(ctx.Attr("max"))); - auto gen_ptr = framework::Generator::GetInstance(); - if (gen_ptr->is_init_py) { - std::mt19937_64 &gen_engine = gen_ptr->GetCPUEngine(); - // auto gen_engine = gen_ptr_->GetCPUEngine(); - // std::uniform_real_distribution dist( - // static_cast(ctx.Attr("min")), - // static_cast(ctx.Attr("max"))); + unsigned int seed = static_cast(ctx.Attr("seed")); + auto engine = framework::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(gen_engine); - } - } else { - unsigned int seed = static_cast(ctx.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); - // std::uniform_real_distribution dist( - // static_cast(ctx.Attr("min")), - // static_cast(ctx.Attr("max"))); - // int64_t size = tensor->numel(); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(engine); - } + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); } - // std::mt19937_64 &engine = gen_ptr->GetCPUEngine(); - // auto engine = gen_ptr_->GetCPUEngine(); unsigned int diag_num = static_cast(ctx.Attr("diag_num")); @@ -139,12 +119,12 @@ class UniformRandomOp : public framework::OperatorWithKernel { if (ctx->HasInputs("ShapeTensorList")) { // top prority shape auto inputs_name = ctx->Inputs("ShapeTensorList"); - PADDLE_ENFORCE_GT( - inputs_name.size(), 0, - platform::errors::InvalidArgument( - "Input(ShapeTensorList)'size of Op(uniform_random) can't be zero." - "Please check the Attr(shape)'s size of" - "Op(fluid.layers.uniform_random).)")); + PADDLE_ENFORCE_GT(inputs_name.size(), 0, + platform::errors::InvalidArgument( + "Input(ShapeTensorList)'size of " + "Op(uniform_random) can't be zero." 
+ "Please check the Attr(shape)'s size of" + "Op(fluid.layers.uniform_random).)")); auto out_dims = std::vector(inputs_name.size(), -1); ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index c024bb87b09c00c34dbaaf7b747f29743152502f..4df1e0ffeb97564803f452114d52ab03d0464f8a 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -88,15 +89,12 @@ class GPUUniformRandomKernel : public framework::OpKernel { } T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); - if (framework::Generator::GetInstance()->is_init_py) { - seed = static_cast( - framework::Generator::GetInstance()->GetCurrentSeed()); - } else { - if (seed == 0) { - std::random_device rd; - seed = rd(); - } + + if (seed == 0) { + std::random_device rd; + seed = rd(); } + T min = static_cast(context.Attr("min")); T max = static_cast(context.Attr("max")); unsigned int diag_num = diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 3bccd5fb2dd92298323381c09467937abd87a53c..90b7f501052530a306ba22ea6a244f0ef8fad563 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -29,23 +29,36 @@ namespace py = pybind11; namespace paddle { namespace pybind { -void BindGenerator(py::module* m) { - py::class_(*m, "GeneratorState", ""); - py::class_(*m, "mt19937_64", ""); +void BindGenerator(py::module* m_ptr) { + auto& m = *m_ptr; + py::class_>(m, "GeneratorState") + .def("current_seed", + [](std::shared_ptr& self) { + return self->current_seed; + }); + py::class_(m, "mt19937_64", ""); py::class_>( - *m, "Generator") - .def(py::init([]() { return framework::Generator::GetInstanceX(); }), - py::return_value_policy::reference) - .def("get_state", &framework::Generator::GetState, - py::return_value_policy::move) + m, "Generator") + .def("__init__", + [](framework::Generator& self) { + new (&self) framework::Generator(); + }) + .def("get_state", &framework::Generator::GetState) .def("set_state", &framework::Generator::SetState) - .def("manual_seed", &framework::Generator::SetCurrentSeed) + .def("manual_seed", + [](std::shared_ptr& self, uint64_t seed) { + self->SetCurrentSeed(seed); + return self; + }) .def("seed", &framework::Generator::Seed) .def("initial_seed", &framework::Generator::GetCurrentSeed) .def("random", &framework::Generator::Random64) - .def("get_cpu_engine", &framework::Generator::GetCPUEngine, - py::return_value_policy::move) - .def("set_cpu_engine", &framework::Generator::SetCPUEngine); + // .def("get_cpu_engine", &framework::Generator::GetCPUEngine) + // .def("set_cpu_engine", &framework::Generator::SetCPUEngine) + .def_property("_is_init_py", &framework::Generator::GetIsInitPy, + &framework::Generator::SetIsInitPy); + m.def("default_cpu_generator", &framework::DefaultCPUGenerator); } // end Generator } // end namespace pybind -} // end namespace paddle +} // namespace paddle diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index cf0dac022f74e47261fc28d02665bcde49dc8b39..040dd313f1c538b5792538f9da04635ff805b9a8 100644 --- 
a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -206,9 +206,9 @@ void BindInferenceApi(py::module *m) { BindMkldnnQuantizerConfig(m); #endif m->def("create_paddle_predictor", - &paddle::CreatePaddlePredictor); + &paddle::CreatePaddlePredictor, py::arg("config")); m->def("create_paddle_predictor", - &paddle::CreatePaddlePredictor); + &paddle::CreatePaddlePredictor, py::arg("config")); m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes); } diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index c84574b21d883b24e1f89c59c3a724aae6621479..862ab2e8db1fdc353db826204d759d99951d5142 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -125,8 +125,15 @@ echo ======================================== echo Step 1. Cmake ... echo ======================================== -echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% -cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% +echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ +-DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ +-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% + +cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ +-DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ +-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% goto:eof :cmake_error @@ -276,7 +283,10 @@ echo git fetch upstream $BRANCH # develop is not fetched>> check_change_of_ echo fi>> check_change_of_unittest.sh echo git checkout -b origin_pr >> check_change_of_unittest.sh echo git checkout -f $BRANCH >> check_change_of_unittest.sh -echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% >> check_change_of_unittest.sh +echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ +-DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ +-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% >> check_change_of_unittest.sh echo cat ^<^> check_change_of_unittest.sh echo ============================================ >> check_change_of_unittest.sh echo Generate unit tests.spec of develop. 
>> check_change_of_unittest.sh diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a77d605eb6c26b02c38a58195d1f8f1e84a3dc20..926747ef6186e3b9439baf787572fe9d1988fb46 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1399,6 +1399,9 @@ function main() { local CMD=$1 local parallel_number=$2 init + if [ "$CMD" != "assert_file_approvals" ];then + python ${PADDLE_ROOT}/tools/summary_env.py + fi case $CMD in build_only) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index c22eee3df6f294d0e364b734c9472a0ef62270e4..46b84697e5a61e164cbc826d5018db7a6d87f69f 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -230,8 +230,6 @@ from .framework import grad #DEFINE_ALIAS from .framework import no_grad #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS from .framework import load #DEFINE_ALIAS -from .framework import prepare_context #DEFINE_ALIAS -from .framework import ParallelEnv #DEFINE_ALIAS from .framework import DataParallel #DEFINE_ALIAS from .framework import NoamDecay #DEFINE_ALIAS diff --git a/python/paddle/dataset/tests/test_sentiment.py b/python/paddle/dataset/tests/test_sentiment.py index bb9830132e987370022df3192060de3e908a2e85..3540ea06b075ed9b649af803c5a655a1e737723b 100644 --- a/python/paddle/dataset/tests/test_sentiment.py +++ b/python/paddle/dataset/tests/test_sentiment.py @@ -42,9 +42,11 @@ class TestSentimentMethods(unittest.TestCase): def test_data_set(self): data_set = st.load_sentiment_data() last_label = -1 + for each in st.test(): self.assertNotEqual(each[1], last_label) last_label = each[1] + self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES) self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES) self.assertEqual( diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 34dd605f901b4357682dc514d59d110db74f9d5b..b7357eef7ad9a3abae7f9c1c09fdc00b409061ad 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -12,4 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. +from . import spawn +from .spawn import spawn + +from . import parallel +from .parallel import init_parallel_env +from .parallel import get_rank +from .parallel import get_world_size +from paddle.fluid.dygraph.parallel import prepare_context #DEFINE_ALIAS +from paddle.fluid.dygraph.parallel import ParallelEnv #DEFINE_ALIAS + +from . import collective from .collective import * + +# start multiprocess apis +__all__ = ["spawn"] + +# dygraph parallel apis +__all__ += [ + "init_parallel_env", + "get_rank", + "get_world_size", + "prepare_context", + "ParallelEnv", +] + +# collective apis +__all__ += collective.__all__ diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index b080fb17553d4a93a545f4ae781d786d82e26576..42ac68ba1a64de54f029878ceab08435c924d087 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -18,16 +18,15 @@ from .base.distributed_strategy import DistributedStrategy from .base.fleet_base import Fleet from .base.util_factory import UtilBase from .dataset import * +#from . 
import metrics __all__ = [ "DistributedStrategy", "UtilBase", "DatasetFactory", - "DatasetBase", - "InMemoryDataset", - "QueueDataset", "UserDefinedRoleMaker", "PaddleCloudRoleMaker", + "Fleet", ] fleet = Fleet() diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 5e527ea03ab9c816948f343ac103672a751fdbc3..26063d1b8a9225aff63628bb37f433ec95257dc7 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -17,6 +17,8 @@ from paddle.distributed.fleet.proto import distributed_strategy_pb2 from paddle.fluid.framework import Variable, set_flags, core import google.protobuf.text_format +__all__ = ["DistributedStrategy"] + def get_msg_dict(msg): res_dict = {} diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index a6286bcca87fad1afddbd8af1e56dad05dab2578..eb2cb19eaecf01f7209e7974e3c47c26810ce21f 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -22,7 +22,7 @@ from .runtime_factory import RuntimeFactory from .util_factory import UtilFactory from paddle.fluid.wrapped_decorator import wrap_decorator -__all__ = ['Fleet'] +#__all__ = ['Fleet'] def _inited_runtime_handler_(func): diff --git a/python/paddle/distributed/fleet/base/meta_optimizer_factory.py b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py index 459070fcc4dbef3711c33b2932e8f1c88647aab5..f845b3fcd8953c44c8b5b857dac08be1c7269958 100755 --- a/python/paddle/distributed/fleet/base/meta_optimizer_factory.py +++ b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
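With the reworked python/paddle/distributed/__init__.py above, the multiprocess and dygraph-parallel entry points become importable directly from paddle.distributed. A short sketch of the resulting import surface; the names come from the __all__ lists and docstrings in this patch, and the training body is a stand-in:

import paddle
import paddle.distributed as dist


def train():
    # each spawned worker enables dygraph and joins the parallel environment
    paddle.disable_static()
    dist.init_parallel_env()
    print("rank %d of %d" % (dist.get_rank(), dist.get_world_size()))


if __name__ == '__main__':
    # nprocs defaults to -1, i.e. one process per visible device
    dist.spawn(train)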
-__all__ = ["MetaOptimizerFactory"] - from ..meta_optimizers import * meta_optimizer_names = list( diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 6aeeb4a2896ea1d20390e463937aa07d3edd0204..3d159a631226f4e52c9cc6ab6d36275265a25f97 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -17,7 +17,7 @@ import numpy as np from multiprocessing import Process, Manager import paddle.fluid as fluid -__all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker'] +#__all__ = ['UserDefinedRoleMaker', 'PaddleCloudRoleMaker'] class Role: diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py index 075e8b6c4302d792606849fc2981e46ccead1e56..78b2b8117b9f6eecfcb410f3cca0b1ec3e99a93b 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py @@ -22,17 +22,3 @@ from .lars_optimizer import LarsOptimizer from .async_graph_execution_optimizer import AsyncGraphExecutionOptimizer from .dgc_optimizer import DGCOptimizer from .lamb_optimizer import LambOptimizer - -__all__ = [ - 'AMPOptimizer', - 'RecomputeOptimizer', - 'GradientMergeOptimizer', - 'AsyncMetaOptimizer', - 'GraphExecutionOptimizer', - 'PipelineOptimizer', - 'LocalSGDOptimizer', - 'LarsOptimizer', - 'AsyncGraphExecutionOptimizer', - 'DGCOptimizer', - 'LambOptimizer', -] diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py index 66db14209b4c57475c30c6dde083593e27f04ea0..b1952276e44cd1466bc443440505462924115ab7 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py @@ -14,8 +14,6 @@ import paddle.fluid.contrib.mixed_precision as mixed_precision from .meta_optimizer_base import MetaOptimizerBase -__all__ = ["AMPOptimizer"] - class AMPOptimizer(MetaOptimizerBase): def __init__(self, optimizer): diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index f34786f9dc309dd1f03319368bbc93ef1bfc03e3..f1c6defc5c982c7d56980642898aaa333c199bbe 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -15,8 +15,6 @@ from paddle.fluid.optimizer import Momentum, DGCMomentumOptimizer from .meta_optimizer_base import MetaOptimizerBase import logging -__all__ = ["DGCOptimizer"] - class DGCOptimizer(MetaOptimizerBase): def __init__(self, optimizer): diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py index bd52179a35862241768ad5bd01eedf16732ad3b6..7db79ad7b5b7081172209faa2396d9f2a31bbdb3 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py @@ -14,10 +14,6 @@ from paddle.fluid.optimizer import GradientMergeOptimizer as GM from .meta_optimizer_base import MetaOptimizerBase -__all__ = ["GradientMergeOptimizer"] - -# amp + gradient merge + lamb - class GradientMergeOptimizer(MetaOptimizerBase): def __init__(self, optimizer): diff --git 
a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py index 7e08a02eb1dc2e14b1871fe7743bbee8ade3feb3..9fa29c4078e9f579a740ef8c0591979e7fbb962d 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py @@ -16,8 +16,6 @@ from paddle.fluid.optimizer import LambOptimizer as LAMB from .meta_optimizer_base import MetaOptimizerBase import logging -__all__ = ["LambOptimizer"] - class LambOptimizer(MetaOptimizerBase): def __init__(self, optimizer): diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py index 09c418fa79106d05cffae1e8bc18fac9c0cc8f34..a7b856ff5b0dcb1ab30de82a12c91a2e1c14fe76 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py @@ -15,8 +15,6 @@ from paddle.fluid.optimizer import Momentum, LarsMomentumOptimizer from .meta_optimizer_base import MetaOptimizerBase import logging -__all__ = ["LarsOptimizer"] - class LarsOptimizer(MetaOptimizerBase): def __init__(self, optimizer): diff --git a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py index 12a4d904340337bf9a99968c7d82db117bf59ce8..073148e11a0a2b08253b89d36d7a014b830518f8 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py +++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
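The meta optimizer modules above and below (AMP, DGC, gradient merge, LAMB, LARS, pipeline, recompute) are not meant to be imported by users; they are selected through the fleet DistributedStrategy switches, which is consistent with dropping their module-level __all__ exports. A rough sketch of that strategy-driven flow; the flag and function names (amp, recompute, fleet.distributed_optimizer) reflect the 2.0 fleet API as generally documented and are not taken verbatim from this patch:

import paddle.distributed.fleet as fleet

# each boolean switch on the strategy asks fleet to wrap the user optimizer
# with the matching meta optimizer (AMPOptimizer, RecomputeOptimizer, ...)
strategy = fleet.DistributedStrategy()
strategy.amp = True
strategy.recompute = False

# the strategy is later consumed along the lines of:
#   fleet.init(is_collective=True)
#   optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)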
-__all__ = ["MetaOptimizerBase"] - from paddle.fluid.optimizer import Optimizer diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index fe9221307cbacfa1beaf030b70a4e4b9223769cc..d5a45e2b4e1aeda2e1c66c0a5a36236622f093ec 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -20,8 +20,6 @@ from paddle.fluid.optimizer import PipelineOptimizer as PO from .meta_optimizer_base import MetaOptimizerBase from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op -__all__ = ["PipelineOptimizer"] - class PipelineHelper(CollectiveHelper): def __init__(self, role_maker, nrings=1, wait_port='6174'): diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py index 45130b447125f6ecbade2e4e5e3dad2f127fda52..3eb3ca6127cfe0d0a7a458c6c44e09ce22e7b24a 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py @@ -14,8 +14,6 @@ from paddle.fluid.optimizer import RecomputeOptimizer as RO from .meta_optimizer_base import MetaOptimizerBase -__all__ = ["RecomputeOptimizer"] - class RecomputeOptimizer(MetaOptimizerBase): def __init__(self, optimizer): diff --git a/python/paddle/distributed/fleet/metrics/__init__.py b/python/paddle/distributed/fleet/metrics/__init__.py index abf198b97e6e818e1fbe59006f98492640bcee54..bc30c063787d28e5bcb4455b3cbd56372879fe0a 100644 --- a/python/paddle/distributed/fleet/metrics/__init__.py +++ b/python/paddle/distributed/fleet/metrics/__init__.py @@ -11,3 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +from .metric import * + +__all__ = [ + "sum", + "max", + "min", + "auc", + "mae", + "rmse", + "mse", + "acc", +] diff --git a/python/paddle/distributed/fleet/runtime/__init__.py b/python/paddle/distributed/fleet/runtime/__init__.py index a796a73fc981b7edbcd57e8f5858456031e7ae6e..cf718b199e52e422ff8f2b66317f3cd6123c76a1 100644 --- a/python/paddle/distributed/fleet/runtime/__init__.py +++ b/python/paddle/distributed/fleet/runtime/__init__.py @@ -14,5 +14,3 @@ from .collective_runtime import CollectiveRuntime from .parameter_server_runtime import ParameterServerRuntime - -__all__ = ["CollectiveRuntime," "ParameterServerRuntime", ] diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py index 212308159aabb123fde11543b3482f2232b4925d..f1911408c84a9dde56a8674e88e0fb8ad575cae7 100644 --- a/python/paddle/distributed/fleet/utils/__init__.py +++ b/python/paddle/distributed/fleet/utils/__init__.py @@ -15,4 +15,4 @@ from .fs import * from .http_server import KVHandler, KVHTTPServer, KVServer -__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__ +#__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__ diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index ecd1cf0ca7bef6586e4833ce80c48eb08a6ad2ee..e2ab321f9aebddd437c92ded9e6005495f760096 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -44,11 +44,9 @@ import time import six import copy from argparse import ArgumentParser, REMAINDER -import paddle -import paddle.fluid as fluid from paddle.distributed.utils import * -import paddle.distributed.cloud_utils as cloud_utils +from paddle.distributed import cloud_utils def _print_arguments(args): @@ -167,7 +165,8 @@ def get_cluster_from_args(args, selected_gpus): def get_gpus(selected_gpus): if selected_gpus is None: - gpus_num = fluid.core.get_cuda_device_count() + from paddle.fluid import core + gpus_num = core.get_cuda_device_count() selected_gpus = [str(x) for x in range(0, gpus_num)] else: cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") @@ -190,7 +189,7 @@ def get_gpus(selected_gpus): return selected_gpus -def launch(args): +def get_cluster_and_pod(args): # parse arguments, used for cloud-single-machine and local selected_gpus = get_gpus(args.selected_gpus) trainers_num = cloud_utils.get_trainers_num() @@ -209,6 +208,12 @@ def launch(args): cluster, pod = get_cluster_from_args(args, selected_gpus) logger.info("get cluster from args:{}".format(cluster)) + return cluster, pod + + +def launch(args): + cluster, pod = get_cluster_and_pod(args) + procs = start_local_trainers( cluster, pod, diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..0c806747217add5022b0c6ea66e184b44ef56836 --- /dev/null +++ b/python/paddle/distributed/parallel.py @@ -0,0 +1,184 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except jin compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
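The launch.py change above factors cluster and pod discovery out of launch() into get_cluster_and_pod(args), so the new spawn module added below can reuse it without going through the command-line entry point. A rough sketch of driving it from code rather than from the CLI; the argument object is hand-built here, with field names mirroring the ParallelEnvArgs class introduced later in this patch:

from paddle.distributed.launch import get_cluster_and_pod


class FakeArgs(object):
    # minimal stand-in for the argparse namespace that launch.py normally builds
    cluster_node_ips = "127.0.0.1"
    node_ip = "127.0.0.1"
    use_paddlecloud = False
    started_port = None
    print_config = False
    selected_gpus = "0,1"


cluster, pod = get_cluster_and_pod(FakeArgs())
print(cluster.trainers_nranks(), [t.endpoint for t in pod.trainers])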
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import six +import warnings + +from paddle import compat as cpt + +# deprecated module import +from paddle.fluid import core +from paddle.fluid.framework import _set_expected_place +from paddle.fluid.dygraph import parallel_helper +from paddle.fluid.dygraph.parallel import ParallelEnv + +__all__ = ["init_parallel_env"] + +ParallelStrategy = core.ParallelStrategy + + +def init_parallel_env(backend='nccl'): + """ + Initialize parallel training environments in dynamic mode. + + Args: + backend(str, optional): The backend to communication between multiple devices. + Now only support ``nccl`` . Default value is ``nccl`` . + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + dist.init_parallel_env() + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = paddle.DataParallel(layer) + + loss_fn = nn.MSELoss() + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) + + # 4. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + if __name__ == '__main__': + dist.spawn(train) + """ + + # 1. input check + if not isinstance(backend, six.string_types): + raise TypeError("input `backend` type error, expected type is str, " + "but received type is %s." % type(backend)) + if cpt.to_text(backend) != 'nccl': + raise ValueError( + "backend `%s` is not supported, now only supports `nccl` backend." % + backend) + + # 2. check env + def _check_var_exists(var_name): + var = os.environ.get(var_name, None) + if var is None: + raise ValueError("paddle.distributed initialize error, " + "environment variable %s is needed, but not set." % + var_name) + + _check_var_exists("FLAGS_selected_gpus") + _check_var_exists("PADDLE_TRAINER_ID") + _check_var_exists("PADDLE_CURRENT_ENDPOINT") + _check_var_exists("PADDLE_TRAINERS_NUM") + _check_var_exists("PADDLE_TRAINER_ENDPOINTS") + + # 3. init ParallelStrategy + strategy = ParallelStrategy() + if cpt.to_text(backend) == 'nccl': + if parallel_helper._is_parallel_ctx_initialized(): + warnings.warn("The parallel environment has been initialized.") + strategy.nranks = ParallelEnv().world_size + strategy.local_rank = ParallelEnv().rank + strategy.trainer_endpoints = ParallelEnv().trainer_endpoints + strategy.current_endpoint = ParallelEnv().current_endpoint + if strategy.nranks < 2: + return + # NOTE(chenweihang): [ why config global place here? 
] + # the dygraph mode will be set to default mode, + # users will not call `dygraph.guard` or `enable_dygraph` + # directly, if they want to switch default place, + # they need to call a function to change default place, + # here just set correctly place to users + place = core.CUDAPlace(ParallelEnv().device_id) + _set_expected_place(place) + + # init nccl context + parallel_helper._set_parallel_ctx( + core.NCCLParallelContext(strategy, place)) + parallel_helper._init_parallel_ctx() + + +def get_rank(): + """ + Returns the rank of current trainer. + + Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` . + The default value is 0. + + Returns: + (int) The rank of current trainer. + + Examples: + .. code-block:: python + + import paddle + import paddle.distributed as dist + + # execute this command in terminal: export PADDLE_TRAINER_ID=0 + print("The rank is %d" % dist.get_rank()) + # The rank is 0 + """ + return ParallelEnv().rank + + +def get_world_size(): + """ + The number of trainers (number of processes participating in current job). + + Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . + The default value is 1. + + Returns: + (int) The number of trainers. + + Examples: + .. code-block:: python + + import paddle + import paddle.distributed as dist + + # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 + print("The world_size is %d" % dist.get_world_size()) + # The world_size is 4 + """ + return ParallelEnv().world_size diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py new file mode 100644 index 0000000000000000000000000000000000000000..1ca2ebaa8d4bd3e0f11e41cdcc35ab585a70b802 --- /dev/null +++ b/python/paddle/distributed/spawn.py @@ -0,0 +1,415 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division + +import multiprocessing +import os +import signal +import six +import sys +import warnings + +from paddle.distributed.launch import get_cluster_and_pod, _print_arguments +from paddle.distributed.utils import _prepare_trainer_env +from paddle.device import get_device + +# deprecated module import +from paddle.fluid import core +from paddle.fluid.framework import _cpu_num + + +# NOTE(chenweihang): The existence of this class leads to +# the maintenance of two arguments. When the launch.py arguments +# is updated, the arguments here also need to be updated, +# but I have not thought of a better way here +class ParallelEnvArgs(object): + def __init__(self): + # Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17.. + self.cluster_node_ips = None + + # The current node ip. + self.node_ip = None + + # whether to use paddlecloud platform to run your multi-process job. + # If false, no need to set this argument. 
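init_parallel_env above refuses to run unless the per-trainer environment variables are present; both launch and the new spawn path inject the same five variables into every worker (see _prepare_trainer_env later in this patch). A minimal sketch that fakes a single-trainer environment by hand so the query helpers can be exercised; in normal use these variables are set by launch or spawn, not manually:

import os
import paddle.distributed as dist

# the five variables checked by init_parallel_env, filled for a one-GPU "job"
os.environ["FLAGS_selected_gpus"] = "0"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_CURRENT_ENDPOINT"] = "127.0.0.1:6170"
os.environ["PADDLE_TRAINERS_NUM"] = "1"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6170"

print(dist.get_rank())        # 0, read from PADDLE_TRAINER_ID
print(dist.get_world_size())  # 1, read from PADDLE_TRAINERS_NUM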
+ self.use_paddlecloud = None + + # The trainer's started port on a single node + self.started_port = None + + # Print the config or not + self.print_config = True + + # It's for gpu training and the training process will run + # on the selected_gpus, each process is bound to a single GPU. + # And if it's not set, this module will use all the gpu cards + # for training. + self.selected_gpus = None + + +def _py_supported_check(): + if not sys.version_info >= (3, 4): + raise RuntimeError( + "Use `paddle.distributed.spawn` to start parallel training " + "requires python version greater than 3.4, if your python " + "is lower than this version, please use " + "`paddle.distributed.launch` instead.") + + +def _get_subprocess_env_list(nprocs, options): + # contruct processes env list + processes_env_list = [] + + # get args from kwargs + args = ParallelEnvArgs() + + # set default `node_ip` and `cluster_node_ips` + args.cluster_node_ips = options.get('cluster_node_ips', None) + args.node_ip = options.get('node_ip', None) + if args.cluster_node_ips is not None and args.node_ip is None: + raise ValueError("please input current node ip, " + "cannot only give `cluster_node_ips`.") + default_node_ip = "127.0.0.1" + if args.node_ip is None: + args.node_ip = default_node_ip + if args.cluster_node_ips is None: + args.cluster_node_ips = default_node_ip + + # set default selected gpus + # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3" + # NOTE(chenweihang): [ why not use FLAGS_selected_gpus directly? ] + # because the FLAGS_selected_gpus may be used in other place, + # if we set FLAGS_selected_gpus to be `0,1,2,3`, it may cause error + # when using `ParallelEnv` + # NOTE(chenweihang): use absolute gpu card id + args.selected_gpus = options.get('selected_gpus', None) + env_devices = os.getenv("CUDA_VISIBLE_DEVICES", None) + if env_devices is None or env_devices == "": + env_devices_list = [ + str(x) for x in six.moves.range(core.get_cuda_device_count()) + ] + else: + env_devices_list = env_devices.split(',') + if args.selected_gpus is None: + if len(env_devices_list) < nprocs: + raise RuntimeError( + "the number of visible devices(%d) is less than the number " + "of spawn processes(%d), please ensure that the correct " + "`nprocs` argument is passed or the environment variable " + "`CUDA_VISIBLE_DEVICES` is correctly configured." % + (len(env_devices_list), nprocs)) + args.selected_gpus = ",".join( + [str(env_devices_list[x]) for x in range(0, nprocs)]) + else: + for card_id in args.selected_gpus.split(','): + if card_id not in env_devices_list: + raise ValueError("The selected gpu card %s cannot found in " + "CUDA_VISIBLE_DEVICES (%s)." 
% + (card_id, ",".join(env_devices_list))) + + # set other arguments + args.started_port = options.get('started_port', None) + args.use_paddlecloud = options.get('use_paddlecloud', False) + args.print_config = options.get('print_config', False) + + # reuse code of launch.py + cluster, pod = get_cluster_and_pod(args) + + # prepare subprocess env list + for trainer in pod.trainers: + processes_env_list.append(_prepare_trainer_env(cluster, trainer)) + + # print config + if args.print_config: + _print_arguments(args) + + return processes_env_list + + +def _remove_risky_env(): + # remove useless env vars, same as launch.py + # no copy, each process will hold env vars itself + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) + + +def _set_trainer_env(env_dict): + for var_name in env_dict: + os.environ[var_name] = env_dict[var_name] + + +def _func_wrapper(func, args, error_queue, return_queue, env_dict): + try: + # config subprocess environment variables + _remove_risky_env() + _set_trainer_env(env_dict) + # execute function + result = func(*args) + # record function return value + return_queue.put(result) + except KeyboardInterrupt: + pass + except Exception: + import traceback + error_queue.put(traceback.format_exc()) + sys.exit(1) + + +class MultiprocessContext(object): + def __init__(self, processes, error_queues, return_queues): + _py_supported_check() + self.error_queues = error_queues + # NOTE(chenweihang): The `spawn` method is mainly used + # to wrap the outermost execution function of the program for + # parallel execution. Generally, the return value is not concerned, + # but if the user needs to obtain the return value, users can get + # the return result of each process from context.return_queues + self.return_queues = return_queues + self.processes = processes + self.sentinels = { + process.sentinel: index + for index, process in enumerate(processes) + } + + def join(self, timeout=None): + if len(self.sentinels) == 0: + return True + + ready = multiprocessing.connection.wait( + self.sentinels.keys(), timeout=timeout) + + error_index = None + for sentinel in ready: + index = self.sentinels.pop(sentinel) + process = self.processes[index] + process.join() + if process.exitcode != 0: + error_index = index + break + + if error_index is None: + return len(self.sentinels) == 0 + + for process in self.processes: + if process.is_alive(): + process.terminate() + process.join() + + self._throw_exception(error_index) + + def _throw_exception(self, error_index): + if self.error_queues[error_index].empty(): + exitcode = self.processes[error_index].exitcode + if exitcode < 0: + name = signal.Signals(-exitcode).name + raise Exception("Process %d terminated with signal %s." % + (error_index, name)) + else: + raise Exception("Process %d terminated with exit code %d." & ( + error_index, exitcode)) + + original_trace = self.error_queues[error_index].get() + msg = "\n\n----------------------------------------------\n" \ + "Process %d terminated with the following error:\n" \ + "----------------------------------------------\n\n" % error_index + msg += original_trace + raise Exception(msg) + + +def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): + """ + Start multiple processes with ``spawn`` method for parallel training. + + Args: + func (function): The target function is called by spawned process. + This function need to be able to pickled, so it must be defined + at the top level of a module. 
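MultiprocessContext above waits on process sentinels and surfaces the first worker failure through a per-process error queue, while _func_wrapper ships either the result or the traceback back to the parent. A stripped-down sketch of that wrapper pattern using plain multiprocessing, not the Paddle classes themselves:

import multiprocessing
import sys
import traceback


def _wrapper(func, args, error_queue, return_queue):
    # run the user function, shipping either its result or its traceback back
    try:
        return_queue.put(func(*args))
    except Exception:
        error_queue.put(traceback.format_exc())
        sys.exit(1)


def double(x):
    return x * 2


def run_worker(func, args=()):
    mp = multiprocessing.get_context("spawn")   # CUDA-safe start method
    error_q, return_q = mp.SimpleQueue(), mp.SimpleQueue()
    proc = mp.Process(target=_wrapper, args=(func, args, error_q, return_q))
    proc.start()
    proc.join()
    if proc.exitcode != 0 and not error_q.empty():
        raise RuntimeError(error_q.get())
    return None if return_q.empty() else return_q.get()


if __name__ == "__main__":
    print(run_worker(double, (21,)))   # prints 42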
+ This function should be called as ``func(i, *args)``, ``i`` is + the process index and ``args`` contains other arguments as tuple. + args (tuple, optional): Arguments passed to ``func``. + nprocs (int, optional): Number of processed to start. Default: -1. + when nprocs is -1, the available device will be obtained from + the environment variable when the model is executed: If use GPU, + the currently available device ID is obtained from the environment + variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available + CPU number is obtained from the environment variable CPU_NUM. + For example, export CPU_NUM=4, if the environment variable is not set, + the executor will add the variable to the environment variable and + set its value to 1. + join (bool, optional): Perform a blocking join on all spawned processes. + Default: True. + daemon (bool, optional): The spawned processes' daemon flag. Default: False. + **options(dict, optional): Other initial parallel execution environment + configuration options. The following options are currently supported: + (1) start_method (string): the way to start a process. + The start method can be ``spawn`` , ``fork`` , ``forkserver`` . + Because the CUDA runtime does not support the ``fork`` start method, + when use CUDA in subprocesses, we should start process by ``spawn`` + or ``forkserver`` method. Default: "spawn" ; + (2) cluster_node_ips (string): Paddle cluster nodes ips, such as + "192.168.0.16,192.168.0.17". Default: "127.0.0.1"; + (3) node_ip (string): The current node ip, such as "192.168.0.16". + Default: "127.0.0.1"; + (4) started_port (int): The trainer's started port on a single node, + such as 6170. Default: None; + (5) selected_gpus (string): The training process will run on the + selected_gpus, such as "0,1,2,3". Default: None; + (6) print_config: Print current parallel training config. Default: False; + (7) use_paddlecloud: Whether to use paddlecloud platform to run your + multi-process job. Default: False. + + Returns: + ``MultiprocessContext`` object, it hold the spawned processes. + + Examples: + .. code-block:: python + + from __future__ import print_function + + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(print_result=False): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + dist.init_parallel_env() + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = paddle.DataParallel(layer) + + loss_fn = nn.MSELoss() + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) + + # 4. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + if print_result is True: + print("loss:", loss.numpy()) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + # Usage 1: only pass function. + # If your training method no need any argument, and + # use all visible devices for parallel training. + if __name__ == '__main__': + dist.spawn(train) + + # Usage 2: pass function and arguments. 
+ # If your training method need some arguments, and + # use all visible devices for parallel training. + if __name__ == '__main__': + dist.spawn(train, args=(True,)) + + # Usage 3: pass function, arguments and nprocs. + # If your training method need some arguments, and + # only use part of visible devices for parallel training. + # If your machine hold 8 cards {0,1,2,3,4,5,6,7}, + # this case will use cards {0,1}; If you set + # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use + # cards {4,5} + if __name__ == '__main__': + dist.spawn(train, args=(True,), nprocs=2) + + # Usage 4: pass function, arguments, nprocs and selected_gpus. + # If your training method need some arguments, and + # only use part of visible devices for parallel training, + # but you can't set your machine's environment varibale + # CUDA_VISIBLE_DEVICES, such as it is None or all cards + # {0,1,2,3,4,5,6,7}, you can pass `selelcted_gpus` to + # select the GPU cards you want to use. For example, + # this case will use cards {4,5} if your machine hold 8 cards. + if __name__ == '__main__': + dist.spawn(train, args=(True,), nprocs=2, selelcted_gpus='4,5') + """ + # NOTE(chenweihang): [ why only supports python3.4+ ? ] + # Python supported setting the child process startup method + # since 3.4. The previous version can only use the default startup + # method, while the default startup method of Unix is fork, which + # cannot support CUDA runtime multi-process + _py_supported_check() + + # get default nprocs + if nprocs == -1: + device = get_device() + if device == 'cpu': + # TODO: not supports cpu parallel now + nprocs = _cpu_num + else: + nprocs = core.get_cuda_device_count() + + # NOTE(chenweihang): [ why need get cluster info before run? ] + # when using `paddle.distributed.spawn` start parallel training, + # we should get cluster info before starting subprocess, and pass + # correct info to each subprocess + procs_env_list = _get_subprocess_env_list(nprocs, options) + + # start processes + # NOTE(chenweihang): [ why default start method is spawn? ] + # The CUDA runtime does not support the fork start method, + # either the spawn or forkserver start method are required + # to use CUDA in subprocesses. 
+ start_method = options.get('start_method', None) + if start_method is None: + start_method = 'spawn' + mp = multiprocessing.get_context(start_method) + + error_queues = [] + return_queues = [] + processes = [] + for i in range(nprocs): + error_queue = mp.SimpleQueue() + return_queue = mp.SimpleQueue() + process = mp.Process( + target=_func_wrapper, + args=(func, args, error_queue, return_queue, procs_env_list[i])) + process.daemon = daemon + process.start() + error_queues.append(error_queue) + return_queues.append(return_queue) + processes.append(process) + + context = MultiprocessContext(processes, error_queues, return_queues) + if not join: + return context + + # loop until all process end + while not context.join(): + pass + + # finally return context + return context diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 7c8fa257f778e71cab35054c3f9d63faaa33de47..1fa307c4d1b89d4033a8f8346b254177053e9dc0 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -327,6 +327,17 @@ def find_free_ports(num): return None +def _prepare_trainer_env(cluster, trainer): + proc_env = { + "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + return proc_env + + class TrainerProc(object): def __init__(self): self.proc = None @@ -352,14 +363,7 @@ def start_local_trainers(cluster, procs = [] for idx, t in enumerate(pod.trainers): - proc_env = { - "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]), - "PADDLE_TRAINER_ID": "%d" % t.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) - } - + proc_env = _prepare_trainer_env(cluster, t) current_env.update(proc_env) logger.debug("trainer proc env:{}".format(current_env)) diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py index 2b331308de5ee9a8aa52a9e303bfbcf8d4264d5f..a5f08ca969ac43f47899395aeb588ddaf2f1e394 100644 --- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py +++ b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py @@ -92,9 +92,11 @@ class TestWeightDecay(unittest.TestCase): return param_sum def check_weight_decay(self, place, model): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() - startup_prog.random_seed = 1 + with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog): data = fluid.layers.data( name="words", shape=[1], dtype="int64", lod_level=1) @@ -113,9 +115,11 @@ class TestWeightDecay(unittest.TestCase): return param_sum def check_weight_decay2(self, place, model): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() - startup_prog.random_seed = 1 + with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog): data = fluid.layers.data( name="words", shape=[1], dtype="int64", lod_level=1) diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 
0c4a1964838c608fc5dd46a1dfb16d3d3d7b6ed9..2174dbd31b8fb1ae97894699e03e25e809085cc8 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -327,19 +327,19 @@ def grad(outputs, This API computes the sum of gradients of `outputs` with respect to each `inputs` . Parameters: - outputs (Variable|list(Variable)|tuple(Variable)): the output Variable or - Variable list/tuple of the graph to compute gradients. - inputs (Variable|list(Variable)|tuple(Variable)): the input Variable or - Variable list/tuple of the graph to compute gradients. The returned + outputs (Tensor|list(Tensor)|tuple(Tensor)): the output Tensor or + Tensor list/tuple of the graph to compute gradients. + inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or + Tensor list/tuple of the graph to compute gradients. The returned values of this API are the gradients of `inputs` . - grad_outputs (Variable|list(Variable|None)|tuple(Variable|None), optional): + grad_outputs (Tensor|list(Tensor|None)|tuple(Tensor|None), optional): initial gradient values of `outputs` . If `grad_outputs` is None, the initial gradient values of `outputs` would be Tensors filled with 1; if `grad_outputs` is not None, it must have the same length as `outputs` , and in this case, the initial gradient value of the i-th `outputs` would be: (1) a Tensor filled with 1 when the i-th element of `grad_outputs` is None; (2) the i-th element of `grad_outputs` when the i-th element of - `grad_outputs` is a Variable. Default None. + `grad_outputs` is a Tensor. Default None. retain_graph (bool, optional): whether to retain the forward graph which is used to calculate the gradient. When it is True, the graph would be retained, in which way users can calculate backward twice for the @@ -351,21 +351,21 @@ def grad(outputs, computing process would be discarded. Default False. only_inputs (bool, optional): whether to only compute the gradients of `inputs` . If it is False, the gradients of all remaining leaf - Variables in the graph would be also computed and accumulated. + Tensors in the graph would be also computed and accumulated. If it is True, only the gradients of `inputs` would be computed. Default True. only_inputs=False is under development, and it is not supported yet. allow_unused (bool, optional): whether to raise error or return None if some - Variables of `inputs` are unreachable in the graph. If some Variables of + Tensors of `inputs` are unreachable in the graph. If some Tensors of `inputs` are unreachable in the graph (i.e., their gradients are None), error would be raised if allow_unused=False, or None would be returned as their gradients if allow_unused=True. Default False. - no_grad_vars (Variable|list(Variable)|tuple(Variable)|set(Variable), optional): - the Variables whose gradients are not needed to compute. Default None. + no_grad_vars (Tensor|list(Tensor)|tuple(Tensor)|set(Tensor), optional): + the Tensors whose gradients are not needed to compute. Default None. Returns: - tuple: a tuple of Variables, whose length is the same as the Variable number - inside `inputs`, and the i-th returned Variable is the sum of gradients of + tuple: a tuple of Tensors, whose length is the same as the Tensor number + inside `inputs`, and the i-th returned Tensor is the sum of gradients of `outputs` with respect to the i-th `inputs`. 
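A minimal usage sketch of the API documented above, assuming the dygraph `grad` described here is exposed as `paddle.grad`; the value in the final comment follows from dy/dx = 2x rather than from a recorded run:

    import numpy as np
    import paddle

    paddle.disable_static()
    x = paddle.to_tensor(np.ones([2, 2], dtype='float32'))
    x.stop_gradient = False          # request gradients for x
    y = x * x                        # elementwise square, dy/dx = 2x
    grads = paddle.grad(outputs=[y], inputs=[x])
    print(grads[0].numpy())          # expected: a 2x2 array filled with 2.0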
Examples 1: diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 54d2cda4ca6858c46140e1fbf6ac8860c3a7c78d..bd578e6ba98a0f31a952bd5620b90e9464fe8666 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -11,21 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os import six import numpy as np +import warnings from collections import OrderedDict -from .. import core -from . import layers -from . import parallel_helper -from .. import framework -from . import to_variable, no_grad + +from paddle.fluid import core +from paddle.fluid import framework +from paddle.fluid.dygraph import layers +from paddle.fluid.dygraph import parallel_helper +from paddle.fluid.dygraph import to_variable, no_grad +from paddle.utils import deprecated __all__ = ["prepare_context", "ParallelEnv", "DataParallel"] ParallelStrategy = core.ParallelStrategy +@deprecated(since="2.0.0", update_to="paddle.distributed.init_parallel_env") def prepare_context(strategy=None): ''' :api_attr: imperative @@ -39,17 +44,18 @@ def prepare_context(strategy=None): if strategy.nranks < 2: return assert framework.in_dygraph_mode() is True, \ - "dygraph.prepare_context should be used with dygrahp mode." + "dygraph.prepare_context should be used with dygraph mode." place = framework._current_expected_place() assert place is not None, \ "dygraph.prepare_context should be used in fluid.dygraph.guard(place) guard." - if isinstance(place, core.CUDAPlace): - parallel_helper._set_parallel_ctx( - core.NCCLParallelContext(strategy, place)) - else: - # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation - assert ("Only support CUDAPlace for now.") - parallel_helper._init_parallel_ctx() + if not parallel_helper._is_parallel_ctx_initialized(): + if isinstance(place, core.CUDAPlace): + parallel_helper._set_parallel_ctx( + core.NCCLParallelContext(strategy, place)) + else: + # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation + assert ("Only support CUDAPlace for now.") + parallel_helper._init_parallel_ctx() return strategy @@ -112,84 +118,84 @@ class ParallelEnv(object): """ def __init__(self): - self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) - self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) - self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0")) + self._rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self._world_size = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) + self._device_id = int(os.getenv("FLAGS_selected_gpus", "0")) self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") @property - def nranks(self): + def rank(self): """ - The number of trainers, generally refers to the number of GPU cards used in training. + Rank of current trainer. - Its value is equal to the value of the environment variable PADDLE_TRAINERS_NUM. The default value is 1. + Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` . The default value is 0. Examples: .. 
code-block:: python - # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 - import paddle.fluid as fluid + # execute this command in terminal: export PADDLE_TRAINER_ID=0 + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() - print("The nranks is %d" % env.nranks) - # The nranks is 4 + env = dist.ParallelEnv() + print("The rank is %d" % env.rank) + # The rank is 0 """ - return self._nranks + return self._rank @property - def local_rank(self): + def world_size(self): """ - The current trainer number. + The number of trainers (number of processes participating in current job). - Its value is equal to the value of the environment variable PADDLE_TRAINER_ID. The default value is 0. + Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . The default value is 1. Examples: .. code-block:: python - # execute this command in terminal: export PADDLE_TRAINER_ID=0 - import paddle.fluid as fluid + # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() - print("The local rank is %d" % env.local_rank) - # The local rank is 0 + env = dist.ParallelEnv() + print("The world_size is %d" % env.world_size) + # The world_size is 4 """ - return self._local_rank + return self._world_size @property - def dev_id(self): + def device_id(self): """ The ID of selected GPU card for parallel training. - Its value is equal to the value of the environment variable FLAGS_selected_gpus. The default value is 0. + Its value is equal to the value of the environment variable ``FLAGS_selected_gpus`` . The default value is 0. Examples: .. code-block:: python # execute this command in terminal: export FLAGS_selected_gpus=1 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() - print("The device id are %d" % env.dev_id) + env = dist.ParallelEnv() + print("The device id are %d" % env.device_id) # The device id are 1 """ - return self._dev_id + return self._device_id @property def current_endpoint(self): """ The endpoint of current trainer, it is in the form of (node IP + port). - Its value is equal to the value of the environment variable PADDLE_CURRENT_ENDPOINT. The default value is "". + Its value is equal to the value of the environment variable ``PADDLE_CURRENT_ENDPOINT`` . The default value is "". Examples: .. code-block:: python # execute this command in terminal: export PADDLE_CURRENT_ENDPOINT=127.0.0.1:6170 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() + env = dist.ParallelEnv() print("The current endpoint are %s" % env.current_endpoint) # The current endpoint are 127.0.0.1:6170 """ @@ -201,20 +207,25 @@ class ParallelEnv(object): The endpoints of all trainer nodes in the task, which are used to broadcast the NCCL ID when NCCL2 is initialized. - Its value is equal to the value of the environment variable PADDLE_TRAINER_ENDPOINTS. The default value is "". + Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ENDPOINTS`` . The default value is "". Examples: .. 
code-block:: python # execute this command in terminal: export PADDLE_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() + env = dist.ParallelEnv() print("The trainer endpoints are %s" % env.trainer_endpoints) # The trainer endpoints are ['127.0.0.1:6170', '127.0.0.1:6171'] """ return self._trainer_endpoints + # [aliases] Compatible with old method names + local_rank = rank + nranks = world_size + dev_id = device_id + # NOTE: [ Compatible ] Originally this class name is `Env`. The semantics of the old class names # are inaccurate and may confuse users, so replace it with `ParallelEnv`, but to be compatible @@ -227,61 +238,98 @@ class DataParallel(layers.Layer): Run the dygraph module with data parallelism. Currently, DataParallel class only supports to run the dynamic graph - with multi-process. The usage is: - `python -m paddle.distributed.launch --selected_gpus=0,1 dynamic_graph_test.py`. - And the content of `dynamic_graph_test.py` is the code of examples. + with multi-process. + + Now supports two ways to start training: + + 1. start by ``paddle.distributed.spawn`` method, for example: + + ``python demo.py`` (spawn need to be called in ``__main__`` method) + + 2. start by ``paddle.distributed.launch`` module, for example: + + ``python -m paddle.distributed.launch --selected_gpus=0,1 demo.py`` . + + And the content of `demo.py` is the code of examples. Args: layers(Layer): The module that should be executed by data parallel. - strategy(ParallelStrategy): The strategy of data parallelism, contains - environment configuration related to parallel execution. - + strategy(ParallelStrategy, optional): (deprecated) The strategy of data parallelism, + contains environment configuration related to parallel execution. Default: None. + Returns: Layer: The data paralleled module. Examples: .. code-block:: python - import numpy as np - import paddle.fluid as fluid + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist - place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id) - with fluid.dygraph.guard(place): - - # prepare the data parallel context - strategy = fluid.dygraph.prepare_context() - - linear = fluid.dygraph.Linear(1, 10, act="softmax") - adam = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, parameter_list=linear.parameters()) - - # make the module become the data parallelism module - linear = fluid.dygraph.DataParallel(linear, strategy) - - x_data = np.random.random(size=[10, 1]).astype(np.float32) - data = fluid.dygraph.to_variable(x_data) - - hidden = linear(data) - avg_loss = fluid.layers.mean(hidden) - - # scale the loss according to the number of trainers. - avg_loss = linear.scale_loss(avg_loss) - - avg_loss.backward() - - # collect the gradients of trainers. - linear.apply_collective_grads() - - adam.minimize(avg_loss) - linear.clear_gradients() + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + dist.init_parallel_env() + + # 3. 
create data parallel layer & optimizer + layer = LinearNet() + dp_layer = paddle.DataParallel(layer) + + loss_fn = nn.MSELoss() + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) + + # 4. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + if __name__ == '__main__': + # 1. start by ``paddle.distributed.spawn`` (default) + dist.spawn(train, nprocs=2) + # 2. start by ``paddle.distributed.launch`` + # train() """ - def __init__(self, layers, strategy): + def __init__(self, layers, strategy=None): super(DataParallel, self).__init__(layers.full_name() + "_data_parallel") self._layers = layers - self._strategy = strategy + + # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. + # It just stores some environment variables, which can be constructed by + # ParallelEnv. Here it is set as an optional argument. + # This parameter is not removed because of compatibility with 1.x writing. + if strategy is not None: + self._strategy = strategy + else: + self._strategy = ParallelStrategy() + self._strategy.nranks = ParallelEnv().nranks + self._strategy.local_rank = ParallelEnv().local_rank + self._strategy.trainer_endpoints = ParallelEnv().trainer_endpoints + self._strategy.current_endpoint = ParallelEnv().current_endpoint def forward(self, *inputs, **kwargs): return self._layers(*inputs, **kwargs) diff --git a/python/paddle/fluid/dygraph/parallel_helper.py b/python/paddle/fluid/dygraph/parallel_helper.py index f378211de2b8a1579ab139318cdc3cb8d5bdc2de..ff1675f0ae8a40b2487d5834b262a1b730641262 100644 --- a/python/paddle/fluid/dygraph/parallel_helper.py +++ b/python/paddle/fluid/dygraph/parallel_helper.py @@ -23,6 +23,11 @@ def _is_data_parallel_mode(): os.getenv("PADDLE_TRAINERS_NUM", "1")) > 1 +def _is_parallel_ctx_initialized(): + global __parallel_ctx__clz__ + return __parallel_ctx__clz__ is not None + + def _set_parallel_ctx(nccl_parallel_context): global __parallel_ctx__clz__ assert __parallel_ctx__clz__ is None, \ diff --git a/python/paddle/fluid/generator.py b/python/paddle/fluid/generator.py index e11b2e484dce1dd4260b3052d0f0a58f3cfc420a..98924f801413bcd822a0d9a6fd61adcc4d00fddc 100644 --- a/python/paddle/fluid/generator.py +++ b/python/paddle/fluid/generator.py @@ -17,44 +17,28 @@ from . import core __all__ = ['Generator'] -default_rng_seed_val = 34342423252 - -class Generator(object): +class Generator(core.Generator): """Generator class""" - def __init__(self, device="CPU"): - """init""" - self.device = device - seed_in = default_rng_seed_val - if self.device == "CPU": - self.generator = core.Generator() - # self.generator.manual_seed(seed_in) - else: - raise ValueError( - "generator class with device %s does not exist, currently only support generator with device 'CPU' " - % device) - - def get_state(self): - return self.generator.get_state() - - def set_state(self, state): - self.generator.set_state(state) + def __init__(self, place=None): + """ + Create a generator object which manages the random number generation. ( Experimental Feature ) - def manual_seed(self, seed): - self.generator.manual_seed(seed) + Parameters: + place(CPUPlace|CUDAPinnedPlace|CUDAPlace, optional): The place to allocate Tensor. Can be + CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. 
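For reference, this is the seeding pattern that the unit tests in this change switch to; both calls appear verbatim elsewhere in this diff, and `_manual_program_seed` is an internal helper rather than public API:

    import paddle

    # Seed the global generator used by dygraph random ops.
    paddle.manual_seed(1)
    # Seed the default programs used by static graph mode (test-only helper).
    paddle.framework.random._manual_program_seed(1)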
- def seed(self): - return self.generator.seed() + Returns: + Generator: A generator object. - def initial_seed(self): - return self.generator.initial_seed() - - def random(self): - return self.generator.random() - - def get_cpu_engine(self): - return self.generator.get_cpu_engine() - - def set_cpu_engine(self, cpu_engine): - self.generator.set_cpu_engine(cpu_engine) + """ + self.place = place + if not place: + place = core.CPUPlace() + if isinstance(place, core.CPUPlace): + super(Generator, self).__init__() + else: + raise ValueError( + "Generator class with %s is not supported yet, currently only generator with CPUPlace is supported " + % place) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ec71e4c9912295ca0844ce91dd2e06e03d9a216d..39c4df00657daccb88ae1ad95781891c4c6ec11e 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1858,6 +1858,7 @@ def conv3d(input, return helper.append_activation(pre_act) +@deprecated(since="2.0.0", update_to="paddle.nn.functional.pool2d") @templatedoc() def pool2d(input, pool_size=-1, @@ -2075,6 +2076,7 @@ def pool2d(input, return pool_out +@deprecated(since="2.0.0", update_to="paddle.nn.functional.pool3d") @templatedoc() def pool3d(input, pool_size=-1, @@ -2303,6 +2305,7 @@ def pool3d(input, return pool_out +@deprecated(since="2.0.0", update_to="paddle.nn.functional.adaptive_pool2d") @templatedoc(op_type="pool2d") def adaptive_pool2d(input, pool_size, @@ -2450,6 +2453,7 @@ def adaptive_pool2d(input, return (pool_out, mask) if require_index else pool_out +@deprecated(since="2.0.0", update_to="paddle.nn.functional.adaptive_pool3d") @templatedoc(op_type="pool3d") def adaptive_pool3d(input, pool_size, @@ -10205,6 +10209,7 @@ def unstack(x, axis=0, num=None): return outs +@deprecated(since='2.0.0', update_to="paddle.expand") def expand(x, expand_times, name=None): """ :alias_main: paddle.expand @@ -10312,6 +10317,7 @@ def expand(x, expand_times, name=None): return out +@deprecated(since='2.0.0', update_to="paddle.expand_as") def expand_as(x, target_tensor, name=None): """ :alias_main: paddle.expand_as @@ -10377,6 +10383,9 @@ def expand_as(x, target_tensor, name=None): #(3,20) """ + if in_dygraph_mode(): + return core.ops.expand_as(x, target_tensor) + check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int32', 'int64', 'bool'], 'expand_as') check_variable_and_dtype(target_tensor, 'target_tensor', diff --git a/python/paddle/fluid/tests/unittests/dist_test.sh b/python/paddle/fluid/tests/unittests/dist_test.sh index 42566f63b68e2c24c459c0bd738455d1ec3bc3da..d5a6490042b20a4f9160c55fbb93e9f2f8092eae 100644 --- a/python/paddle/fluid/tests/unittests/dist_test.sh +++ b/python/paddle/fluid/tests/unittests/dist_test.sh @@ -61,7 +61,14 @@ for i in {1..2}; do fi done +echo "dist space:" +df -h + #display /tmp/files +echo "ls /tmp/paddle.*" ls -l /tmp/paddle.* +echo "ls -l ./" +ls -l ./ + exit 1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py index dd58a49bb55c24a5e126965bff415d9a54cff5ad..af7e73c41464dbd26c476f20d4a1533e37d34ce3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py @@ -15,6 +15,7 @@ import math import numpy as np import unittest +import paddle from paddle.jit import to_static import paddle.fluid as fluid from paddle.fluid import ParamAttr @@ -560,8 +561,8 @@ def train_bmn(args, place,
to_static): loss_data = [] with fluid.dygraph.guard(place): - fluid.default_main_program().random_seed = SEED - fluid.default_startup_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) global local_random local_random = np.random.RandomState(SEED) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index 0e2bac9fa5b5c9e47ce8a08b0187531a3b83dcee..4d735b565ddbcd0bea4e879f0ae5881e459c8f1d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -21,6 +21,7 @@ import unittest import os os.environ["CUDA_VISIBLE_DEVICES"] = "2" +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph import Embedding, Linear, GRUUnit @@ -448,8 +449,8 @@ def do_train(args, to_static): place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) reader = get_random_input_data(args.batch_size, args.vocab_size, args.num_labels) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index 5ec3de5871dd6787c06938a8b771f7d14e54e1e0..a377075062b268723aaa3cb17bfa25d6b181798d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -14,6 +14,7 @@ import time import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.initializer import MSRA from paddle.fluid.param_attr import ParamAttr @@ -447,8 +448,8 @@ def train_mobilenet(args, to_static): with fluid.dygraph.guard(args.place): np.random.seed(SEED) - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) if args.model == "MobileNetV1": net = MobileNetV1(class_dim=args.class_dim, scale=1.0) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py index 790319936ac015db09e45b9eac799b3bdf0b0250..df2b69297bb4d9167fa3f1a1fe0005a77ededf8a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py @@ -19,7 +19,7 @@ import time import unittest import numpy as np - +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator from paddle.fluid.dygraph.base import to_variable @@ -218,8 +218,8 @@ def train(place): batch_num = 200 with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) ptb_model = PtbModel( hidden_size=hidden_size, vocab_size=vocab_size, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py index 
4813930159744fae362aec7563ea5cda82d958c5..1d211197ebd48f1d0fba87ef807c61a3315ed153 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py @@ -16,6 +16,7 @@ import gym import math import itertools import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.dygraph.nn as nn from paddle.fluid.dygraph import to_variable, Layer @@ -64,8 +65,8 @@ def train(args, place, to_static): env.seed(SEED) with fluid.dygraph.guard(place): - fluid.default_main_program().random_seed = SEED - fluid.default_startup_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) local_random = np.random.RandomState(SEED) policy = Policy() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 46eb2b42e9265ac7f6340ee0be3a7127e5246eef..6556b2f03bd5304e290792d07d1d969ab255bfdc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -215,8 +215,8 @@ def train(to_static): """ with fluid.dygraph.guard(place): np.random.seed(SEED) - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) train_reader = paddle.batch( reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index 30cba78fec19c169966e85ff43e79c3a00889616..38e4d5ad5480beb195bcc0c3cc21f033df8fbd5d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -331,8 +331,8 @@ def train(train_reader, to_static): np.random.seed(SEED) with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) se_resnext = SeResNeXt() optimizer = optimizer_setting(train_parameters, se_resnext.parameters()) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py index fd5a58be26be43996bbb1f80557512bf974de52f..2aa3396fb7f8534374746329af43d4f823e4d5cf 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py @@ -15,6 +15,7 @@ import time import unittest import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Conv2D, Linear, Embedding from paddle.fluid.dygraph import to_variable, ProgramTranslator, declarative @@ -285,8 +286,8 @@ def train(args, to_static): with fluid.dygraph.guard(place): np.random.seed(SEED) - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) train_reader = fake_data_reader(args.class_num, args.vocab_size, args.batch_size, args.padding_size) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py index 552a6307f33378e7b35f84e048729d22a063c796..14b9ac2e99584b35ec0821949b56b6ee92076571 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py @@ -108,8 +108,8 @@ def train(conf_dict, to_static): place = fluid.CPUPlace() with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) conf_dict['dict_size'] = len(vocab) conf_dict['seq_len'] = args.seq_len diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py index 7aa465949eb704ee5f23b3ad44f3fc57adb04154..4fc8d27d30cb8f67c30bbcd8dcd30938f906462d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py @@ -18,6 +18,7 @@ import time import unittest import numpy as np +import paddle import paddle.fluid as fluid import transformer_util as util @@ -31,10 +32,11 @@ STEP_NUM = 10 def train_static(args, batch_generator): + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) train_prog = fluid.Program() startup_prog = fluid.Program() - train_prog.random_seed = SEED - startup_prog.random_seed = SEED + with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): # define input and reader @@ -128,8 +130,8 @@ def train_static(args, batch_generator): def train_dygraph(args, batch_generator): with fluid.dygraph.guard(place): if SEED is not None: - fluid.default_main_program().random_seed = SEED - fluid.default_startup_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) # define data loader train_loader = fluid.io.DataLoader.from_generator(capacity=10) train_loader.set_batch_generator(batch_generator, places=place) @@ -220,7 +222,8 @@ def train_dygraph(args, batch_generator): def predict_dygraph(args, batch_generator): with fluid.dygraph.guard(place): - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) # define data loader test_loader = fluid.io.DataLoader.from_generator(capacity=10) @@ -291,7 +294,8 @@ def predict_dygraph(args, batch_generator): def predict_static(args, batch_generator): test_prog = fluid.Program() with fluid.program_guard(test_prog): - test_prog.random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) # define input and reader input_field_names = util.encoder_data_input_fields + util.fast_decoder_data_input_fields diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py index 13a97fb7478db8acaa46db3b7a6c4341997193eb..bedca412157f0b4d125f75ee5eabd0145411451b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py @@ -20,7 +20,7 @@ import random import sys import time import unittest - +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import declarative, ProgramTranslator, to_variable from paddle.fluid.dygraph.nn import Conv2D, BatchNorm, Linear, Pool2D @@ -272,8 +272,8 @@ def 
train(args, fake_data_reader, to_static): random.seed(0) np.random.seed(0) with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = 1000 - fluid.default_main_program().random_seed = 1000 + paddle.manual_seed(1000) + paddle.framework.random._manual_program_seed(1000) video_model = TSM_ResNet("TSM", train_config, 'Train') diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index ec6b81f138321f2119a5a5aaf4b5ba9ae8f7e69b..9c3ed13cbb0002231888433a451c7d5e7188244d 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -17,6 +17,7 @@ from __future__ import print_function import multiprocessing import os import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid import compiler @@ -64,10 +65,11 @@ class TestParallelExecutorBase(unittest.TestCase): feed_data_reader, FeedDataReader ), "feed_data_reader must be type of FeedDataReader" + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) main = fluid.Program() startup = fluid.Program() - startup.random_seed = 1 - main.random_seed = 1 + with fluid.program_guard(main, startup): feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder, main, method, optimizer) diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py new file mode 100644 index 0000000000000000000000000000000000000000..278d7b27c528803211e21ae7b1f1190e3053bcc4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -0,0 +1,81 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division + +import numpy as np +import unittest + +import paddle + +# used by model.run_trainer in test_dist_base +from test_dist_base import RUN_STEP + + +# NOTE: compatible TestParallelDyGraphRunnerBase args +class SpawnAssistTestArgs(object): + update_method = "local" + trainer_id = 0 + + +class TestDistSpawnRunner(unittest.TestCase): + def setUp(self): + # NOTE(chenweihang): keep consistent with + # TestDistBase.check_with_place + self.nprocs = 2 + + def _run(self, model, args): + args.update_method = "local" + return model.run_trainer_with_spawn(args) + + def _run_parallel(self, model, args): + args.update_method = "nccl2" + context = paddle.distributed.spawn( + func=model.run_trainer_with_spawn, + args=(args, ), + nprocs=self.nprocs, + join=True) + result_list = [] + for res_queue in context.return_queues: + result_list.append(res_queue.get()) + return result_list + + def check_dist_result_with_spawn(self, test_class, delta=1e-3): + # 0. prepare model and args + model = test_class() + args = SpawnAssistTestArgs() + + # 1. calc signal card loss + losses = self._run(model, args) + + # 2. 
calc multi card loss (nccl mode) + dist_losses_list = self._run_parallel(model, args) + + # 3. compare losses + for step_id in range(RUN_STEP): + loss = losses[step_id] + dist_loss_sum = None + for dist_losses in dist_losses_list: + if dist_loss_sum is None: + dist_loss_sum = np.array(dist_losses[step_id]) + else: + dist_loss_sum += np.array(dist_losses[step_id]) + dist_loss = dist_loss_sum / self.nprocs + self.assertAlmostEqual( + loss, + dist_loss, + delta=delta, + msg="The results of single-card execution and multi-card execution are inconsistent." + "signal-card loss is:\n{}\nmulti-card average loss is:\n{}\n". + format(loss, dist_loss)) diff --git a/python/paddle/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py index 969a7da3b71b69296f3313342adbf989c60edb50..2c6c018b9dfac13d97c242e1f36adbddf9dbf3f1 100644 --- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py +++ b/python/paddle/fluid/tests/unittests/test_adadelta_op.py @@ -17,6 +17,8 @@ from __future__ import print_function import unittest import numpy as np from op_test import OpTest +import paddle +import paddle.fluid as fluid class TestAdadeltaOp1(OpTest): @@ -108,5 +110,54 @@ class TestAdadeltaOp2(OpTest): self.check_output() +class TestAdadeltaV2(unittest.TestCase): + def test_adadelta_dygraph(self): + paddle.disable_static(paddle.CPUPlace()) + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adadelta( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_adadelta(self): + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + rms_optimizer = paddle.optimizer.Adadelta(learning_rate=0.1) + rms_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + def test_raise_error(self): + self.assertRaises(ValueError, paddle.optimizer.Adadelta, None) + self.assertRaises( + ValueError, paddle.optimizer.Adadelta, learning_rate=0.1, rho=None) + self.assertRaises( + ValueError, + paddle.optimizer.Adadelta, + learning_rate=0.1, + epsilon=None) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py new file mode 100644 index 0000000000000000000000000000000000000000..5a135cea52903a0d896df2d446b58d99e5a18993 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py @@ -0,0 +1,119 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +import paddle +import paddle.nn.functional as F +import paddle.fluid as fluid + + +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def avg_pool1D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=False, + adaptive=False, + data_type=np.float64): + N, C, L = x.shape + if global_pool == 1: + ksize = [L] + if adaptive: + L_out = ksize[0] + else: + L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + L - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + + out = np.zeros((N, C, L_out)) + for i in range(L_out): + if adaptive: + r_start = adaptive_start_index(i, L, ksize[0]) + r_end = adaptive_end_index(i, L, ksize[0]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L)) + x_masked = x[:, :, r_start:r_end] + + field_size = (r_end - r_start) \ + if (exclusive or adaptive) else (ksize[0]) + if data_type == np.int8 or data_type == np.uint8: + out[:, :, i] = (np.rint( + np.sum(x_masked, axis=(2, 3)) / field_size)).astype(data_type) + else: + out[:, :, i] = (np.sum(x_masked, axis=(2)) / + field_size).astype(data_type) + return out + + +class TestPool1d_API(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_adaptive_avg_dygraph_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = F.adaptive_avg_pool1d(input, output_size=16) + result_np = avg_pool1D_forward_naive( + input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + ada_max_pool1d_dg = paddle.nn.layer.AdaptiveAvgPool1d( + output_size=16) + result = ada_max_pool1d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_adaptive_avg_static_results(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") + result = F.adaptive_avg_pool1d(input, output_size=16) + + input_np = np.random.random([2, 3, 32]).astype("float32") + result_np = avg_pool1D_forward_naive( + input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True) + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[result]) + self.assertTrue(np.allclose(fetches[0], result_np)) + + def test_adaptive_avg_pool1d(self): + for place in self.places: + self.check_adaptive_avg_dygraph_results(place) + 
self.check_adaptive_avg_static_results(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py new file mode 100644 index 0000000000000000000000000000000000000000..875fdf9e9c3f9a9b891ecc6911dfeda788eee271 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid import compiler, Program, program_guard +import paddle +import paddle.nn.functional as F +import paddle.fluid as fluid + + +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def max_pool1D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=False, + adaptive=False, + data_type=np.float64): + N, C, L = x.shape + if global_pool == 1: + ksize = [L] + if adaptive: + L_out = ksize[0] + else: + L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + L - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + + out = np.zeros((N, C, L_out)) + for i in range(L_out): + if adaptive: + r_start = adaptive_start_index(i, L, ksize[0]) + r_end = adaptive_end_index(i, L, ksize[0]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L)) + x_masked = x[:, :, r_start:r_end] + + out[:, :, i] = np.max(x_masked, axis=(2)) + return out + + +class TestPool1d_API(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_adaptive_max_dygraph_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = F.adaptive_max_pool1d(input, output_size=16) + + result_np = max_pool1D_forward_naive( + input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1d( + output_size=16) + result = ada_max_pool1d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_adaptive_max_static_results(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") + result = F.adaptive_max_pool1d(input, output_size=16) + + input_np = np.random.random([2, 3, 32]).astype("float32") + result_np = max_pool1D_forward_naive( + input_np, ksize=[16], strides=[2], paddings=[0], 
adaptive=True) + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[result]) + self.assertTrue(np.allclose(fetches[0], result_np)) + + def test_adaptive_max_pool1d(self): + for place in self.places: + self.check_adaptive_max_dygraph_results(place) + self.check_adaptive_max_static_results(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py new file mode 100644 index 0000000000000000000000000000000000000000..d78788eb1e7c63be485210780db25e1de6fd84b4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py @@ -0,0 +1,274 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from __future__ import division + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def adaptive_pool2d_forward(x, output_size, data_format='NCHW', + pool_type="max"): + + N = x.shape[0] + C, H, W = [x.shape[1], x.shape[2], x.shape[3]] if data_format == 'NCHW' \ + else [x.shape[3], x.shape[1], x.shape[2]] + + if (isinstance(output_size, int) or output_size == None): + H_out = output_size + W_out = output_size + output_size = [H_out, W_out] + else: + H_out, W_out = output_size + + if output_size[0] == None: + output_size[0] = H + H_out = H + if output_size[1] == None: + output_size[1] = W + W_out = W + + out = np.zeros((N, C, H_out, W_out)) if data_format=='NCHW' \ + else np.zeros((N, H_out, W_out, C)) + + for i in range(H_out): + in_h_start = adaptive_start_index(i, H, output_size[0]) + in_h_end = adaptive_end_index(i, H, output_size[0]) + + for j in range(W_out): + in_w_start = adaptive_start_index(j, W, output_size[1]) + in_w_end = adaptive_end_index(j, W, output_size[1]) + + if data_format == 'NCHW': + x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end] + if pool_type == 'avg': + field_size = ( + (in_h_end - in_h_start) * (in_w_end - in_w_start)) + out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size + elif pool_type == 'max': + out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) + elif data_format == 'NHWC': + x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :] + if pool_type == 'avg': + field_size = ( + (in_h_end - in_h_start) * (in_w_end - in_w_start)) + out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size + elif pool_type == 'max': + out[:, i, j, :] = np.max(x_masked, axis=(1, 2)) + return out + + +class TestAdaptiveMaxPool2dAPI(unittest.TestCase): + def 
setUp(self): + self.x_np = np.random.random([2, 3, 7, 7]).astype("float32") + self.res_1_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[3, 3], pool_type="max") + + self.res_2_np = adaptive_pool2d_forward( + x=self.x_np, output_size=5, pool_type="max") + + self.res_3_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[2, 5], pool_type="max") + """ + self.res_4_np = adaptive_pool2d_forward( + x=self.x_np, + output_size=[3, 3], + pool_type="max", + data_format="NHWC") + """ + self.res_5_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[None, 3], pool_type="max") + + def test_static_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.enable_static() + x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32") + + out_1 = paddle.nn.functional.adaptive_max_pool2d( + x=x, output_size=[3, 3]) + + out_2 = paddle.nn.functional.adaptive_max_pool2d(x=x, output_size=5) + + out_3 = paddle.nn.functional.adaptive_max_pool2d( + x=x, output_size=[2, 5]) + + #out_4 = paddle.nn.functional.adaptive_max_pool2d( + # x=x, output_size=[3, 3], data_format="NHWC") + + out_5 = paddle.nn.functional.adaptive_max_pool2d( + x=x, output_size=[None, 3]) + + exe = paddle.static.Executor(place=place) + [res_1, res_2, res_3, res_5] = exe.run( + fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=[out_1, out_2, out_3, out_5]) + + assert np.allclose(res_1, self.res_1_np) + + assert np.allclose(res_2, self.res_2_np) + + assert np.allclose(res_3, self.res_3_np) + + #assert np.allclose(res_4, self.res_4_np) + + assert np.allclose(res_5, self.res_5_np) + + def test_dynamic_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + x = paddle.to_variable(self.x_np) + + out_1 = paddle.nn.functional.adaptive_max_pool2d( + x=x, return_indices=False, output_size=[3, 3]) + + out_2 = paddle.nn.functional.adaptive_max_pool2d(x=x, output_size=5) + + out_3 = paddle.nn.functional.adaptive_max_pool2d( + x=x, output_size=[2, 5]) + + #out_4 = paddle.nn.functional.adaptive_max_pool2d( + # x=x, output_size=[3, 3], data_format="NHWC") + + out_5 = paddle.nn.functional.adaptive_max_pool2d( + x=x, output_size=[None, 3]) + + assert np.allclose(out_1.numpy(), self.res_1_np) + + assert np.allclose(out_2.numpy(), self.res_2_np) + + assert np.allclose(out_3.numpy(), self.res_3_np) + + #assert np.allclose(out_4.numpy(), self.res_4_np) + + assert np.allclose(out_5.numpy(), self.res_5_np) + + +class TestAdaptiveMaxPool2dClassAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random([2, 3, 7, 7]).astype("float32") + self.res_1_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[3, 3], pool_type="max") + + self.res_2_np = adaptive_pool2d_forward( + x=self.x_np, output_size=5, pool_type="max") + + self.res_3_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[2, 5], pool_type="max") + + #self.res_4_np = adaptive_pool2d_forward( + # x=self.x_np, + # output_size=[3, 3], + # pool_type="max", + # data_format="NHWC") + + self.res_5_np = adaptive_pool2d_forward( + x=self.x_np, output_size=[None, 3], pool_type="max") + + def test_static_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.enable_static() + x = paddle.data(name="x", 
shape=[2, 3, 7, 7], dtype="float32") + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3]) + out_1 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=5) + out_2 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[2, 5]) + out_3 = adaptive_max_pool(x=x) + + # adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d( + # output_size=[3, 3], data_format="NHWC") + # out_4 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d( + output_size=[None, 3]) + out_5 = adaptive_max_pool(x=x) + + exe = paddle.static.Executor(place=place) + [res_1, res_2, res_3, res_5] = exe.run( + fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=[out_1, out_2, out_3, out_5]) + + assert np.allclose(res_1, self.res_1_np) + + assert np.allclose(res_2, self.res_2_np) + + assert np.allclose(res_3, self.res_3_np) + + #assert np.allclose(res_4, self.res_4_np) + + assert np.allclose(res_5, self.res_5_np) + + def test_dynamic_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + x = paddle.to_variable(self.x_np) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3]) + out_1 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=5) + out_2 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[2, 5]) + out_3 = adaptive_max_pool(x=x) + + #adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d( + # output_size=[3, 3], data_format="NHWC") + #out_4 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d( + output_size=[None, 3]) + out_5 = adaptive_max_pool(x=x) + + assert np.allclose(out_1.numpy(), self.res_1_np) + + assert np.allclose(out_2.numpy(), self.res_2_np) + + assert np.allclose(out_3.numpy(), self.res_3_np) + + #assert np.allclose(out_4.numpy(), self.res_4_np) + + assert np.allclose(out_5.numpy(), self.res_5_np) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py new file mode 100755 index 0000000000000000000000000000000000000000..a7de0a5c6a7017617124b893313e0f9830cc09f9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py @@ -0,0 +1,293 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
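The reference implementations in these new pooling tests all derive their windows from the same floor/ceil index rule; a standalone NumPy sketch (the helper names match those used in the tests) of how that rule partitions an input of length 32 into 16 adaptive bins:

    import numpy as np

    def adaptive_start_index(index, input_size, output_size):
        return int(np.floor(index * input_size / output_size))

    def adaptive_end_index(index, input_size, output_size):
        return int(np.ceil((index + 1) * input_size / output_size))

    # For L = 32 and output_size = 16, bin i covers [2*i, 2*i + 2),
    # so each output element pools exactly two inputs.
    bins = [(adaptive_start_index(i, 32, 16), adaptive_end_index(i, 32, 16))
            for i in range(16)]
    print(bins[:3])  # [(0, 2), (2, 4), (4, 6)]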
+ +from __future__ import print_function +from __future__ import division + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def adaptive_pool3d_forward(x, + output_size, + adaptive=True, + data_format='NCDHW', + pool_type='max'): + + N = x.shape[0] + C, D, H, W = [x.shape[1], x.shape[2], x.shape[3], x.shape[4]] \ + if data_format == 'NCDHW' else [x.shape[4], x.shape[1], x.shape[2],x.shape[3]] + + if (isinstance(output_size, int) or output_size == None): + H_out = output_size + W_out = output_size + D_out = output_size + output_size = [D_out, H_out, W_out] + else: + D_out, H_out, W_out = output_size + + if output_size[0] == None: + output_size[0] = D + D_out = D + if output_size[1] == None: + output_size[1] = H + H_out = H + if output_size[2] == None: + output_size[2] = W + W_out = W + + out = np.zeros((N, C, D_out, H_out, W_out)) if data_format=='NCDHW' \ + else np.zeros((N, D_out, H_out, W_out, C)) + for k in range(D_out): + d_start = adaptive_start_index(k, D, output_size[0]) + d_end = adaptive_end_index(k, D, output_size[0]) + + for i in range(H_out): + h_start = adaptive_start_index(i, H, output_size[1]) + h_end = adaptive_end_index(i, H, output_size[1]) + + for j in range(W_out): + w_start = adaptive_start_index(j, W, output_size[2]) + w_end = adaptive_end_index(j, W, output_size[2]) + + if data_format == 'NCDHW': + x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start: + w_end] + if pool_type == 'avg': + field_size = (d_end - d_start) * (h_end - h_start) * ( + w_end - w_start) + out[:, :, k, i, j] = np.sum(x_masked, + axis=(2, 3, 4)) / field_size + elif pool_type == 'max': + out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) + + elif data_format == 'NDHWC': + x_masked = x[:, d_start:d_end, h_start:h_end, w_start: + w_end, :] + if pool_type == 'avg': + field_size = (d_end - d_start) * (h_end - h_start) * ( + w_end - w_start) + out[:, k, i, j, :] = np.sum(x_masked, + axis=(1, 2, 3)) / field_size + elif pool_type == 'max': + out[:, k, i, j, :] = np.max(x_masked, axis=(1, 2, 3)) + return out + + +class TestAdaptiveMaxPool3dAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32") + self.res_1_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[3, 3, 3], pool_type="max") + + self.res_2_np = adaptive_pool3d_forward( + x=self.x_np, output_size=5, pool_type="max") + + self.res_3_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[2, 3, 5], pool_type="max") + + self.res_4_np = adaptive_pool3d_forward( + x=self.x_np, + output_size=[3, 3, 3], + pool_type="max", + data_format="NDHWC") + + self.res_5_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[None, 3, None], pool_type="max") + + def test_static_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.enable_static() + x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") + + out_1 = paddle.nn.functional.adaptive_max_pool3d( + x=x, output_size=[3, 3, 3]) + + out_2 = paddle.nn.functional.adaptive_max_pool3d(x=x, output_size=5) + + out_3 = 
paddle.nn.functional.adaptive_max_pool3d( + x=x, output_size=[2, 3, 5]) + + #out_4 = paddle.nn.functional.adaptive_max_pool3d( + # x=x, output_size=[3, 3, 3], data_format="NDHWC") + + out_5 = paddle.nn.functional.adaptive_max_pool3d( + x=x, output_size=[None, 3, None]) + + exe = paddle.static.Executor(place=place) + [res_1, res_2, res_3, res_5] = exe.run( + fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=[out_1, out_2, out_3, out_5]) + + assert np.allclose(res_1, self.res_1_np) + + assert np.allclose(res_2, self.res_2_np) + + assert np.allclose(res_3, self.res_3_np) + + #assert np.allclose(res_4, self.res_4_np) + + assert np.allclose(res_5, self.res_5_np) + + def test_dynamic_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + x = paddle.to_variable(self.x_np) + + out_1 = paddle.nn.functional.adaptive_max_pool3d( + x=x, output_size=[3, 3, 3]) + + out_2 = paddle.nn.functional.adaptive_max_pool3d(x=x, output_size=5) + + out_3 = paddle.nn.functional.adaptive_max_pool3d( + x=x, output_size=[2, 3, 5]) + + #out_4 = paddle.nn.functional.adaptive_max_pool3d( + # x=x, output_size=[3, 3, 3], data_format="NDHWC") + + out_5 = paddle.nn.functional.adaptive_max_pool3d( + x=x, output_size=[None, 3, None]) + + assert np.allclose(out_1.numpy(), self.res_1_np) + + assert np.allclose(out_2.numpy(), self.res_2_np) + + assert np.allclose(out_3.numpy(), self.res_3_np) + + #assert np.allclose(out_4.numpy(), self.res_4_np) + + assert np.allclose(out_5.numpy(), self.res_5_np) + + +class TestAdaptiveMaxPool3dClassAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32") + self.res_1_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[3, 3, 3], pool_type="max") + + self.res_2_np = adaptive_pool3d_forward( + x=self.x_np, output_size=5, pool_type="max") + + self.res_3_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[2, 3, 5], pool_type="max") + + # self.res_4_np = adaptive_pool3d_forward( + # x=self.x_np, + # output_size=[3, 3, 3], + # pool_type="max", + # data_format="NDHWC") + + self.res_5_np = adaptive_pool3d_forward( + x=self.x_np, output_size=[None, 3, None], pool_type="max") + + def test_static_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.enable_static() + x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + output_size=[3, 3, 3]) + out_1 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(output_size=5) + out_2 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + output_size=[2, 3, 5]) + out_3 = adaptive_max_pool(x=x) + + # adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + # output_size=[3, 3, 3], data_format="NDHWC") + # out_4 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + output_size=[None, 3, None]) + out_5 = adaptive_max_pool(x=x) + + exe = paddle.static.Executor(place=place) + [res_1, res_2, res_3, res_5] = exe.run( + fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=[out_1, out_2, out_3, out_5]) + + assert np.allclose(res_1, self.res_1_np) + + assert np.allclose(res_2, self.res_2_np) + + assert np.allclose(res_3, self.res_3_np) + + # assert np.allclose(res_4, self.res_4_np) + + assert 
np.allclose(res_5, self.res_5_np) + + def test_dynamic_graph(self): + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + x = paddle.to_variable(self.x_np) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + output_size=[3, 3, 3]) + out_1 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(output_size=5) + out_2 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + output_size=[2, 3, 5]) + out_3 = adaptive_max_pool(x=x) + + # adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + # output_size=[3, 3, 3], data_format="NDHWC") + # out_4 = adaptive_max_pool(x=x) + + adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( + output_size=[None, 3, None]) + out_5 = adaptive_max_pool(x=x) + + assert np.allclose(out_1.numpy(), self.res_1_np) + + assert np.allclose(out_2.numpy(), self.res_2_np) + + assert np.allclose(out_3.numpy(), self.res_3_np) + + # assert np.allclose(out_4.numpy(), self.res_4_np) + + assert np.allclose(out_5.numpy(), self.res_5_np) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index c6d3c6e7d0492b2f4a98a595f015e3b9f4a19e24..5c705378e515eec4c950f6996e2789df603fcda3 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -85,10 +85,35 @@ class TestBatchNorm(unittest.TestCase): y = bn(fluid.dygraph.to_variable(x)) return y.numpy() + def compute_v3(x, is_test, trainable_statistics): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.0), + trainable=False), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.0), + trainable=False), + trainable_statistics=trainable_statistics) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v4(x): + with fluid.dygraph.guard(p): + bn = paddle.nn.BatchNorm2d( + shape[1], weight_attr=False, bias_attr=False) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + x = np.random.randn(*shape).astype("float32") y1 = compute_v1(x, False, False) y2 = compute_v2(x) + y3 = compute_v3(x, False, False) + y4 = compute_v4(x) self.assertTrue(np.allclose(y1, y2)) + self.assertTrue(np.allclose(y3, y4)) def test_static(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py index 671efd8c721550256c181059528bead43deb0718..43d485a0a6d24be6e8db32f16fe96a70bb229858 100644 --- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py +++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import paddle import paddle.fluid as fluid from paddle.fluid.framework import Parameter import numpy as np @@ -44,10 +45,10 @@ class InplaceTestBase(unittest.TestCase): def build_program_and_scope(self): self.place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) startup_program = fluid.Program() main_program = fluid.Program() - startup_program.random_seed = 1 - main_program.random_seed = 1 scope = fluid.Scope() with fluid.program_guard(main_program, startup_program): diff --git a/python/paddle/fluid/tests/unittests/test_compiled_program.py b/python/paddle/fluid/tests/unittests/test_compiled_program.py index 8430f39578047facdeae6535db134152a7038717..751fed2e56126909d36670b56afbeee9bf2a694f 100644 --- a/python/paddle/fluid/tests/unittests/test_compiled_program.py +++ b/python/paddle/fluid/tests/unittests/test_compiled_program.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid import core from test_imperative_base import new_program_scope @@ -29,8 +30,8 @@ class TestCompiledProgram(unittest.TestCase): self.label = np.random.randint( low=0, high=10, size=[16, 1], dtype=np.int64) with new_program_scope(): - fluid.default_startup_program().random_seed = self.seed - fluid.default_main_program().random_seed = self.seed + paddle.manual_seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() exe = fluid.Executor(place) @@ -46,8 +47,8 @@ class TestCompiledProgram(unittest.TestCase): def test_compiled_program_base(self): with new_program_scope(): - fluid.default_startup_program().random_seed = self.seed - fluid.default_main_program().random_seed = self.seed + paddle.manual_seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() exe = fluid.Executor(place) @@ -64,8 +65,8 @@ class TestCompiledProgram(unittest.TestCase): def test_compiled_program_with_data_parallel(self): with new_program_scope(): - fluid.default_startup_program().random_seed = self.seed - fluid.default_main_program().random_seed = self.seed + paddle.manual_seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() exe = fluid.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py index a16f21c0f97c0902dd6c26561ed3f707b28ff947..cc0f3745bbf7bb1fae0c0ac430491ac2e0d9b55f 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py @@ -34,10 +34,10 @@ def random_reader(): def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) startup_prog = fluid.Program() main_prog = fluid.Program() - startup_prog.random_seed = 1 - main_prog.random_seed = 1 with fluid.unique_name.guard(): with fluid.program_guard(main_prog, startup_prog): diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 74cc87bd9dbd691c6a1683ac44cba246e67c4af2..2919ec5e9ca97b1d59af46a54b2d702cb6de4a14 100644 --- 
a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -38,9 +38,10 @@ class TestDirectory(unittest.TestCase): 'paddle.enable_static', 'paddle.disable_static', 'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad', 'paddle.no_grad', 'paddle.save', 'paddle.load', - 'paddle.static.save', 'paddle.static.load', 'paddle.ParallelEnv', - 'paddle.prepare_context', 'paddle.DataParallel', 'paddle.jit', - 'paddle.jit.TracedLayer', 'paddle.jit.to_static', + 'paddle.static.save', 'paddle.static.load', + 'paddle.distributed.ParallelEnv', + 'paddle.distributed.prepare_context', 'paddle.DataParallel', + 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', 'paddle.jit.save', 'paddle.jit.load', 'paddle.jit.SaveLoadConfig', 'paddle.NoamDecay', 'paddle.PiecewiseDecay', diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index ba292f2d87c376ace317fc3fb9b81ce5c5596eb2..faff81fa84fb5fa66c9ff14f782d2301e3964672 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -23,8 +23,11 @@ import subprocess import six import argparse import pickle +import random import numpy as np import time + +import paddle import paddle.fluid as fluid from paddle.fluid import compiler import paddle.fluid.dygraph as dygraph @@ -382,22 +385,22 @@ class TestParallelDyGraphRunnerBase(object): raise NotImplementedError( "train_one_loop should be implemented by the child classes.") + def _get_data(self, batch, args): + if args.update_method != "local": + new_batch = [] + for offset, item in enumerate(batch): + if offset % 2 == args.trainer_id: + new_batch.append(item) + return new_batch + else: + return batch + def run_trainer(self, args): seed = 90 device_id = int(os.getenv("FLAGS_selected_gpus", "0")) place = fluid.CUDAPlace(device_id) - def _get_data(batch): - if args.update_method != "local": - new_batch = [] - for offset, item in enumerate(batch): - if offset % 2 == args.trainer_id: - new_batch.append(item) - return new_batch - else: - return batch - with fluid.dygraph.guard(place): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed @@ -422,7 +425,7 @@ class TestParallelDyGraphRunnerBase(object): out_losses = [] print_to_err(type(self).__name__, "begin to run dygraph training") for step_id, data in enumerate(train_reader()): - data = _get_data(data) + data = self._get_data(data, args) if step_id == RUN_STEP: break loss = self.run_one_loop(model, opt, data) @@ -444,6 +447,47 @@ class TestParallelDyGraphRunnerBase(object): model.clear_gradients() print_to_out(out_losses) + def run_trainer_with_spawn(self, args): + # 1. enable dygraph + paddle.disable_static() + + # 2. init seed + seed = 90 + paddle.static.default_startup_program().random_seed = seed + paddle.static.default_main_program().random_seed = seed + np.random.seed(seed) + random.seed = seed + # get trainer id + args.trainer_id = paddle.distributed.get_rank() + + # 3. init parallel env + if args.update_method == "nccl2": + paddle.distributed.init_parallel_env() + + # 4. 
train model + model, train_reader, opt = self.get_model() + if args.update_method == "nccl2": + model = paddle.DataParallel(model) + + out_losses = [] + for step_id, data in enumerate(train_reader()): + data = self._get_data(data, args) + if step_id == RUN_STEP: + break + loss = self.run_one_loop(model, opt, data) + out_losses.append(loss.numpy()) + + if args.update_method == "nccl2": + loss = model.scale_loss(loss) + + loss.backward() + if args.update_method == "nccl2": + model.apply_collective_grads() + + opt.minimize(loss) + model.clear_gradients() + return out_losses + def runtime_main(test_class): parser = argparse.ArgumentParser(description='Run dist test.') diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py index ae4355ec412c87ff1d947338d4f15256ace539b8..88b496c1d89e63752f93bd2abb77c61af7e86c4d 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py @@ -27,6 +27,8 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope +SEED = 123123111 + class SimpleImgConvPool(fluid.dygraph.Layer): def __init__(self, @@ -105,12 +107,11 @@ class MNIST(fluid.dygraph.Layer): class TestDygraphMultiForward(unittest.TestCase): def test_mnist_forward_float32(self): - seed = 90 epoch_num = 1 - with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + with fluid.dygraph.guard(): + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) mnist = MNIST() sgd = SGDOptimizer( learning_rate=1e-3, parameter_list=mnist.parameters()) @@ -142,9 +143,8 @@ class TestDygraphMultiForward(unittest.TestCase): dy_param_init_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index 6c0bb97bf6f14b2cef5050a99c02c7843b86be92..e0c0277270b406bf745a5293e9c1bcbe158e52d0 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -18,6 +18,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.layers as layers @@ -465,9 +466,9 @@ class PaddingRNNTestBase(unittest.TestCase): pass def _prepare_program(self, config, parallel=True): + paddle.manual_seed(config.random_seed) self.main_program = fluid.Program() self.startup_program = fluid.Program() - self.startup_program.random_seed = config.random_seed with fluid.program_guard(self.main_program, self.startup_program): with fluid.unique_name.guard(): res_vars = lm_model( diff --git a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py index 5a562dc14650a74ee6f76fa3d8c5f207da6475d6..c18b7c5b044e76fdfbb53b68633cfaf222190a38 100644 --- 
a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py @@ -13,6 +13,7 @@ # limitations under the License. import numpy as np +import paddle import paddle.fluid as fluid import six import unittest @@ -37,13 +38,13 @@ class TestEmbeddingIdStopGradientBase(unittest.TestCase): self.assertTrue(np.array_equal(grad_value1, grad_value2)) def run_program(self, place, stop_gradient=False): + np.random.seed(1) + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) + startup_program = fluid.Program() main_program = fluid.Program() - np.random.seed(1) - startup_program.random_seed = 1 - main_program.random_seed = 1 - scope = fluid.Scope() with fluid.program_guard(main_program, startup_program): with fluid.scope_guard(scope): diff --git a/python/paddle/fluid/tests/unittests/test_expand_as_op.py b/python/paddle/fluid/tests/unittests/test_expand_as_op.py index 69ed9f141437c307dc9e43fb501000d5cafeeaf7..150aff78508c61031a97bb56c9f14c4485cecea1 100755 --- a/python/paddle/fluid/tests/unittests/test_expand_as_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_as_op.py @@ -102,8 +102,23 @@ class TestExpandAsOpRank4(OpTest): self.check_grad(['X'], 'Out') +# Test dygraph API +class TestExpandAsDygraphAPI(unittest.TestCase): + def test_api(self): + import paddle + paddle.disable_static() + np_data_x = np.array([1, 2, 3]).astype('int32') + np_data_y = np.array([1, 2, 3, 1, 2, 3]).astype('int32') + data_x = paddle.to_tensor(np_data_x) + data_y = paddle.to_tensor(np_data_y) + out = fluid.layers.expand_as(data_x, data_y) + np_out = out.numpy() + assert np.array_equal(np_out, np.tile(np_data_x, (2))) + paddle.enable_static() + + # Test python API -class TestExpandAPI(unittest.TestCase): +class TestExpandAsAPI(unittest.TestCase): def test_api(self): input1 = np.random.random([12, 14]).astype("float32") input2 = np.random.random([48, 14]).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py index e5a7e6c702aec114968adb7fb23309c9d944559d..ec30cb70c579092b1ee03b9debc9a26dcb19926e 100644 --- a/python/paddle/fluid/tests/unittests/test_fc_op.py +++ b/python/paddle/fluid/tests/unittests/test_fc_op.py @@ -13,6 +13,7 @@ # limitations under the License. 
import unittest +import paddle import numpy as np from op_test import OpTest import paddle.fluid as fluid @@ -135,31 +136,32 @@ class TestFCOpWithPadding(TestFCOp): class TestFcOp_NumFlattenDims_NegOne(unittest.TestCase): def test_api(self): - startup_program = Program() - main_program = Program() - startup_program.random_seed = SEED - main_program.random_seed = SEED - - with program_guard(main_program, startup_program): - input = np.random.random([2, 2, 25]).astype("float32") - x = fluid.layers.data( - name="x", - shape=[2, 2, 25], - append_batch_size=False, - dtype="float32") - - out_1 = fluid.layers.fc(input=x, size=1, num_flatten_dims=-1) - out_2 = fluid.layers.fc(input=x, size=1, num_flatten_dims=2) - - place = fluid.CPUPlace() if not core.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) - exe = fluid.Executor(place=place) - exe.run(startup_program) - res_1, res_2 = exe.run(main_program, - feed={"x": input}, - fetch_list=[out_1, out_2]) - - assert np.array_equal(res_1, res_2) + def run_program(num_flatten_dims): + paddle.manual_seed(SEED) + startup_program = Program() + main_program = Program() + + with program_guard(main_program, startup_program): + input = np.random.random([2, 2, 25]).astype("float32") + x = fluid.layers.data( + name="x", + shape=[2, 2, 25], + append_batch_size=False, + dtype="float32") + + out = fluid.layers.fc(input=x, + size=1, + num_flatten_dims=num_flatten_dims) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + exe = fluid.Executor(place=place) + exe.run(startup_program) + out = exe.run(main_program, feed={"x": input}, fetch_list=[out]) + + res_1 = run_program(-1) + res_2 = run_program(2) + self.assertTrue(np.array_equal(res_1, res_2)) class TestFCOpError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py index f5e888ab0eb3ca597bf62245ff9f3024fe81ee95..25801793f1f2e70c404727ed4f64c7d3c830aec9 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py @@ -43,7 +43,7 @@ class TestFleetBase(unittest.TestCase): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py index 06f8da84a28d22127fad122d39d4e3903fdf25bf..47671ab3a85e8596d5b677f5e1cf9f6ebecaf155 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net from fake_reader import fake_imdb_reader from parallel_executor_test_base import TestParallelExecutorBase diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py index 62eef67a5695f62c3594824c24886e23a5a59801..921dbdbc6d4e1b169c2c8aa199ea15f886bd0128 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py @@ -19,8 +19,6 @@ import unittest class TestFuseBatchNormActPass(unittest.TestCase): def build_program(self, main_program, startup_program, use_cuda, seed=1): - main_program.random_seed = seed - startup_program.random_seed = seed with fluid.program_guard(main_program, startup_program): x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32') y = fluid.layers.data(name="y", shape=[1], dtype='int64') @@ -59,6 +57,8 @@ class TestFuseBatchNormActPass(unittest.TestCase): return x, y, loss def check(self, place, use_cuda): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) main_program = fluid.Program() startup_program = fluid.Program() x, y, loss = self.build_program(main_program, startup_program, use_cuda) diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py index b47bcd2a032a32f30b2bcdd2b48541c660abdab2..a22daeedd09e9a1da3a17773fed43d35ece51bec 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net from fake_reader import fake_imdb_reader from parallel_executor_test_base import TestParallelExecutorBase diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 6b08c4250f61c9680a13b21f1c6c2e940c60ca75..9ab84404073906a8a95f9eb562cbe220e7c6b455 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np - +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.op import Operator @@ -37,6 +37,7 @@ class TestGaussianRandomOp(OpTest): "seed": 10, "use_mkldnn": self.use_mkldnn } + paddle.manual_seed(10) self.outputs = {'Out': np.zeros((123, 92), dtype='float32')} diff --git a/python/paddle/fluid/tests/unittests/test_generator.py b/python/paddle/fluid/tests/unittests/test_generator.py index 6cc43d3d5498284e8a24dd272eaed08cdf830733..8b1f420358d3187bd4746431fefe449a4d6ed2ec 100644 --- a/python/paddle/fluid/tests/unittests/test_generator.py +++ b/python/paddle/fluid/tests/unittests/test_generator.py @@ -16,6 +16,7 @@ from __future__ import print_function import os import unittest +import paddle import paddle.fluid.generator as generator import time # temp for debug @@ -34,10 +35,11 @@ class TestGenerator(unittest.TestCase): st = gen.get_state() gen.set_state(st) gen.random() - gen.set_cpu_engine(gen.get_cpu_engine()) def test_basic_generator_error(self): - self.assertRaises(ValueError, generator.Generator, device="CUDA") + if paddle.fluid.core.is_compiled_with_cuda(): + self.assertRaises( + ValueError, generator.Generator, place=paddle.CUDAPlace(0)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py index 4f0beb8c0dcd5384e7b9f6e30e8082595ac4dc06..7c1ff41f7e7674936f2725dae3ea19f399cf51e4 100644 --- a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py +++ b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py @@ -35,10 +35,10 @@ def random_reader(): def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) startup_prog = fluid.Program() main_prog = fluid.Program() - startup_prog.random_seed = 1 - main_prog.random_seed = 1 with fluid.unique_name.guard(): with fluid.program_guard(main_prog, startup_prog): diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 5777bb3c6f5e34f035c32ed963906b5ccc03ba85..5c9867e681524f519e267fb744fc4090c836036a 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import Program, program_guard @@ -266,8 +267,8 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase): def training_test(self, is_sparse): with fluid.program_guard(fluid.Program(), fluid.Program()): + paddle.manual_seed(1) start_up = fluid.default_startup_program() - start_up.random_seed = 1 # Fix random seed x = np.arange(6).reshape(6) path_table = np.array([(1, 2, -1), 
(1, 2, -1)]).astype('int64') path_code = np.array([(1, 0, -1), (0, 0, -1)]).astype('int64') diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index a4f3858d6fb242b8689bd1d300861faf8ed73e54..fdf7adbfb45f0a6133909f4a7d6b488cae09144c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -121,6 +121,7 @@ class TestAmpScaler(unittest.TestCase): def run_simple_conv(inp_np, use_scaler=True): paddle.manual_seed(10) + paddle.framework.random._manual_program_seed(10) with fluid.dygraph.guard(): model = SimpleConv( num_channels=3, @@ -204,6 +205,7 @@ class TestResnet(unittest.TestCase): with fluid.dygraph.guard(): paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) resnet = ResNet(use_cudnn=True) optimizer = optimizer_setting( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py index d3f488d92ac455072b37274e2ce782bcf41e8cc7..428f97c0af8182efdaab31dbd720e523578f2292 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py @@ -43,7 +43,7 @@ class MLP(fluid.Layer): class TestDataParallelStateDict(unittest.TestCase): def test_data_parallel_state_dict(self): with fluid.dygraph.guard(): - strategy = paddle.prepare_context() + strategy = paddle.distributed.prepare_context() mlp = MLP() parallel_mlp = dygraph.parallel.DataParallel(mlp, strategy) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index af71d9d27b9a349e2b0e08c03dd04e3936d34afb..cc6c2f97a9334bf0c3932ecc1fcc18b0b56b2797 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -206,11 +206,10 @@ class TestDygraphDeepCF(unittest.TestCase): else: (users_np, items_np, labels_np, num_users, num_items, matrix) = get_data() - + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) startup = fluid.Program() - startup.random_seed = seed main = fluid.Program() - main.random_seed = seed scope = fluid.core.Scope() with new_program_scope(main=main, startup=startup, scope=scope): @@ -244,8 +243,8 @@ class TestDygraphDeepCF(unittest.TestCase): sys.stderr.write('static loss %s\n' % static_loss) with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) deepcf = DeepCF(num_users, num_items, matrix) adam = fluid.optimizer.AdamOptimizer( @@ -269,8 +268,8 @@ class TestDygraphDeepCF(unittest.TestCase): sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss)) with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) deepcf2 = DeepCF(num_users, num_items, matrix) adam2 = fluid.optimizer.AdamOptimizer( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index 
227cd5d4acb290baeb622a84d729b01bc45d48b1..720c9f95c251ec54c7e7fa74c8e59e135a8c6be7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -312,6 +312,7 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): with fluid.dygraph.guard(): paddle.manual_seed(123) + paddle.framework.random._manual_program_seed(123) a = fluid.dygraph.to_variable(value) a.stop_gradient = False @@ -328,6 +329,7 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): with fluid.dygraph.guard(): paddle.manual_seed(123) + paddle.framework.random._manual_program_seed(123) a = fluid.dygraph.to_variable(value) a.stop_gradient = False diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index 80bdf2ea8a898716fa20be315ac57371191b1a61..b752b439f0fa945f75c8cb0c1478668e3dd2c6d5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -56,13 +56,11 @@ class Generator(fluid.Layer): class TestDygraphGAN(unittest.TestCase): def test_gan_float32(self): seed = 90 - + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) startup = fluid.Program() - startup.random_seed = seed discriminate_p = fluid.Program() generate_p = fluid.Program() - discriminate_p.random_seed = seed - generate_p.random_seed = seed scope = fluid.core.Scope() with new_program_scope( @@ -133,8 +131,8 @@ class TestDygraphGAN(unittest.TestCase): dy_params = dict() with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) discriminator = Discriminator() generator = Generator() @@ -177,10 +175,9 @@ class TestDygraphGAN(unittest.TestCase): dy_params2 = dict() with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) discriminator2 = Discriminator() generator2 = Generator() sgd2 = SGDOptimizer( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index 01f3c02774698376c576a446f32634583623a737..4db6f2d0da1d5287a3c9ccca7c5459e5915a514f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -61,12 +61,10 @@ class GCN(fluid.Layer): class TestDygraphGNN(unittest.TestCase): def test_gnn_float32(self): - seed = 90 - + paddle.manual_seed(90) + paddle.framework.random._manual_program_seed(90) startup = fluid.Program() - startup.random_seed = seed main = fluid.Program() - main.random_seed = seed scope = fluid.core.Scope() with new_program_scope(main=main, startup=startup, scope=scope): @@ -114,8 +112,8 @@ class TestDygraphGNN(unittest.TestCase): scope.find_var(model.gc.weight.name).get_tensor()) with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(90) + paddle.framework.random._manual_program_seed(90) features = np.ones([1, 100, 50], dtype=np.float32) # Use selected rows when it's supported. 
@@ -140,8 +138,8 @@ class TestDygraphGNN(unittest.TestCase): model_gc_weight_value = model.gc.weight.numpy() with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(90) + paddle.framework.random._manual_program_seed(90) features2 = np.ones([1, 100, 50], dtype=np.float32) # Use selected rows when it's supported. diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index 6349d71760934c9da3aed4896ea651c45af657ad..f0fea2d7eb75cff376ebce3505e175030619697c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.dygraph.nn import Embedding @@ -94,8 +95,8 @@ class TestDygraphSimpleNet(unittest.TestCase): for is_sort_sum_gradient in [True, False]: with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) simple_net = SimpleNet( hidden_size=hidden_size, @@ -139,8 +140,8 @@ class TestDygraphSimpleNet(unittest.TestCase): dy_loss_value = dy_loss.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) simple_net = SimpleNet( hidden_size=hidden_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 499a4311f6e1714b239259d68217370edea20a2f..5400b785d2929b4ff8614d4a6dbe26f197bf5ad1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np import six +import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit @@ -401,9 +402,9 @@ class TestDygraphOCRAttention(unittest.TestCase): dtype='int64').reshape([1, Config.max_length]))) with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) ocr_attention = OCRAttention() if Config.learning_rate_decay == "piecewise_decay": @@ -453,8 +454,8 @@ class TestDygraphOCRAttention(unittest.TestCase): dy_param_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) ocr_attention = OCRAttention() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 
a7783afc5cff3da97b623aec3297881013724a78..7876675bcc6a1cb5ea190adfa16fb5e4de8c2e35 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -74,8 +74,8 @@ class TestImperativeOptimizerBase(unittest.TestCase): with fluid.dygraph.guard(place): try: - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) mlp = MLP() optimizer = self.get_optimizer_dygraph( parameter_list=mlp.parameters()) @@ -91,8 +91,8 @@ class TestImperativeOptimizerBase(unittest.TestCase): ) else fluid.CUDAPlace(0) with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) mlp = MLP() optimizer = self.get_optimizer_dygraph( @@ -132,8 +132,8 @@ class TestImperativeOptimizerBase(unittest.TestCase): dy_param_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) if place == None: place = fluid.CPUPlace() if not core.is_compiled_with_cuda( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py index ec331e2e5b3b8ab541d4075852f5ecfe0300e28e..619e9e8e90783365b5f0d718783a14468520c8d4 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -74,8 +74,8 @@ class TestImperativeOptimizerBase(unittest.TestCase): with fluid.dygraph.guard(place): try: - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) mlp = MLP() optimizer = self.get_optimizer_dygraph( parameter_list=mlp.parameters()) @@ -91,8 +91,8 @@ class TestImperativeOptimizerBase(unittest.TestCase): ) else fluid.CUDAPlace(0) with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) mlp = MLP() optimizer = self.get_optimizer_dygraph( @@ -132,8 +132,8 @@ class TestImperativeOptimizerBase(unittest.TestCase): dy_param_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) if place == None: place = fluid.CPUPlace() if not core.is_compiled_with_cuda( @@ -658,7 +658,7 @@ class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase): class TestImperativePipelineOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): optimizer = paddle.optimizer.SGD(learning_rate=0.5, - parameter_list=parameter_list) + parameters=parameter_list) optimizer = PipelineOptimizer(optimizer) return optimizer @@ -670,7 +670,7 @@ class TestImperativePipelineOptimizer(TestImperativeOptimizerBase): class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): optimizer = 
paddle.optimizer.SGD(learning_rate=0.5, - parameter_list=parameter_list) + parameters=parameter_list) optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5) return optimizer @@ -682,7 +682,7 @@ class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase): class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): optimizer = paddle.optimizer.SGD(learning_rate=0.5, - parameter_list=parameter_list) + parameters=parameter_list) optimizer = RecomputeOptimizer(optimizer) return optimizer diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index bd629f5f4a69a9a8c94f1b2cc58935f0e991ead0..fa23ff8e7c29fa5a07cab03f7407910f687ce9ee 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.dygraph.nn import Embedding @@ -225,8 +226,8 @@ class TestDygraphPtbRnn(unittest.TestCase): traced_layer = None with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -293,8 +294,8 @@ class TestDygraphPtbRnn(unittest.TestCase): dy_last_hidden_value = last_hidden.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) ptb_model = PtbModel( hidden_size=hidden_size, vocab_size=vocab_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py index 526c1706e2d08bdf779846a6f30706435eb4a503..0487f8dd9a640b7d337dbc603030e6dd6bbe7ef7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.dygraph.nn import Embedding @@ -43,9 +44,10 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -93,8 +95,9 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): dy_last_hidden_value = last_hidden.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + ptb_model = PtbModel( hidden_size=hidden_size, vocab_size=vocab_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py index 
735ec4d3f1ea869a17bea4efdba9e5dcedb39fb6..0076c61e584074ed091b9b0c80e9aa5be00e48fb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py @@ -64,8 +64,8 @@ class TestImperativeMnist(unittest.TestCase): mask = np.array(mask_list).astype("float32") with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) policy = Policy(input_size=4) @@ -105,8 +105,8 @@ class TestImperativeMnist(unittest.TestCase): dy_param_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 815437072fde291b8d8348dba0b4b0ae872ec1b9..e8a2298c17d001abeac2f113df08ee01b94c9422 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -251,8 +251,8 @@ class TestDygraphResnet(unittest.TestCase): traced_layer = None with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) resnet = ResNet() optimizer = optimizer_setting( @@ -334,8 +334,8 @@ class TestDygraphResnet(unittest.TestCase): dy_param_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py index d26d6f25aa8ffbbde3af9148bebba156eeef5e38..13b12da3318cad709b1978dc581ff479a1d842c6 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py @@ -77,9 +77,10 @@ class TestDygraphResnetSortGradient(unittest.TestCase): batch_size = train_parameters["batch_size"] batch_num = 10 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + resnet = ResNet() optimizer = optimizer_setting( train_parameters, parameter_list=resnet.parameters()) @@ -136,8 +137,8 @@ class TestDygraphResnetSortGradient(unittest.TestCase): dy_param_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) diff --git 
a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index a2f75089102ebbcdd2753c8fdad6653b511919bd..48aea3a584dd25667704b22d99d1074c481bb76c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -219,8 +219,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -305,8 +305,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -415,8 +415,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -522,8 +522,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -635,8 +635,6 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -714,8 +712,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -805,9 +803,10 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( hidden_size=hidden_size, vocab_size=vocab_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index c1e7fc2b1d3faeceaa90ac03c12819108be2bc38..e81d1c8610f6bebffadf930b67dc14a4a418ef05 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -219,8 +219,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + 
paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -307,8 +307,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -414,8 +414,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -522,8 +522,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -636,8 +636,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -715,8 +715,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -806,8 +806,8 @@ class TestDygraphPtbRnn(unittest.TestCase): batch_num = 200 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index 283addaf6283a5365d983e4737bf2a8fdf5ee0b9..a04e1e4e5aafeeb605348b30125c5d42b3171674 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -308,8 +308,8 @@ class TestImperativeResneXt(unittest.TestCase): batch_num = 1 epoch_num = 1 with fluid.dygraph.guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) se_resnext = SeResNeXt() optimizer = optimizer_setting( @@ -367,8 +367,8 @@ class TestImperativeResneXt(unittest.TestCase): dy_param_value[param.name] = param.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else 
fluid.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py index 3765cb784d6522cd0249a77045f8cbc841a2d9ac..794f59e48507e6002311e54e8ae31f3ad1bf4647 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.dygraph.nn import Embedding @@ -101,8 +102,8 @@ class TestDygraphSimpleNet(unittest.TestCase): for is_sort_sum_gradient in [True, False]: traced_layer = None with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) simple_net = SimpleNet( hidden_size=hidden_size, @@ -145,8 +146,8 @@ class TestDygraphSimpleNet(unittest.TestCase): dy_loss_value = dy_loss.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) simple_net = SimpleNet( hidden_size=hidden_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index d603a7d6ca0dea8df2e60207211f2061f1fe616d..e94157fa047eef065bc4bd0bfb3d6b6c778ea7b9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -468,8 +468,8 @@ def build_optimizer(layer, cfg, loss=None): class DyGraphTrainModel(object): def __init__(self, cfg): - fluid.default_startup_program().random_seed = cfg.seed - fluid.default_main_program().random_seed = cfg.seed + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) self.generator = Generator(cfg) self.discriminator = Discriminator(cfg) @@ -529,12 +529,12 @@ class StaticGraphTrainModel(object): shape=[None, cfg.c_dim], dtype='float32', name='label_trg') return image_real, label_org, label_trg + paddle.manual_seed(cfg.seed) + paddle.framework.random._manual_program_seed(cfg.seed) self.gen_program = fluid.Program() gen_startup_program = fluid.Program() with fluid.program_guard(self.gen_program, gen_startup_program): - self.gen_program.random_seed = cfg.seed - gen_startup_program.random_seed = cfg.seed with fluid.unique_name.guard(): image_real, label_org, label_trg = create_data_layer() generator = Generator(cfg) @@ -546,8 +546,6 @@ class StaticGraphTrainModel(object): self.dis_program = fluid.Program() dis_startup_program = fluid.Program() with fluid.program_guard(self.dis_program, dis_startup_program): - self.dis_program.random_seed = cfg.seed - dis_startup_program.random_seed = cfg.seed with fluid.unique_name.guard(): image_real, label_org, label_trg = create_data_layer() generator = Generator(cfg) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index c59ce44ec96a87383ec12998767af70ac07ff743..9f58ef881e4e47365be03cdb5786ac292c938f03 100644 --- 
a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid from paddle.fluid import Embedding, LayerNorm, Linear, Layer from paddle.fluid.dygraph import to_variable, guard @@ -949,9 +950,9 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): seed = 90 with guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) transformer = TransFormer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, @@ -1034,8 +1035,8 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): dy_token_num_value = dy_token_num.numpy() with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) transformer = TransFormer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py index c5228fcf122748d2518238aa21ea486ed5f60d46..eaa7e711a29c7b96691f630733d913003fce9e43 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py @@ -37,10 +37,10 @@ class TestIrMemoryOptimizeIfElseOp(unittest.TestCase): use_cuda=True, use_mem_opt=False, iter_num=5): + paddle.manual_seed(100) + paddle.framework.random._manual_program_seed(100) prog = Program() startup_prog = Program() - prog.random_seed = 100 - startup_prog.random_seed = 100 with program_guard(prog, startup_prog): image = layers.data(name='x', shape=[784], dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 2b79659b9c6957aa6b141f8b36cb674497cb1392..87b6e76a6d0ab7f5fba7c4526734d81475e1540e 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -18,7 +18,7 @@ import os import pickle import unittest import numpy as np - +import paddle from paddle.static import InputSpec import paddle.fluid as fluid from paddle.fluid.dygraph import Linear @@ -80,7 +80,7 @@ class LinearNetReturnLoss(fluid.dygraph.Layer): def train(layer, input_size=784, label_size=1): # create optimizer - adam = fluid.optimizer.SGDOptimizer( + sgd = fluid.optimizer.SGDOptimizer( learning_rate=0.01, parameter_list=layer.parameters()) # create data loader train_loader = fluid.io.DataLoader.from_generator(capacity=5) @@ -97,7 +97,7 @@ def train(layer, input_size=784, label_size=1): avg_loss = fluid.layers.mean(loss) avg_loss.backward() - adam.minimize(avg_loss) + sgd.minimize(avg_loss) layer.clear_gradients() return [img], layer, avg_loss @@ -108,7 +108,8 @@ class TestJitSaveLoad(unittest.TestCase): # enable dygraph mode fluid.enable_dygraph() # config seed - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) def train_and_save_model(self, model_path=None, configs=None): layer = LinearNet(784, 1) @@ -149,8 
+150,8 @@ class TestJitSaveLoad(unittest.TestCase): train_layer.train() load_train_layer.train() # train & compare - _, _, train_loss = train(train_layer) - _, _, load_train_loss = train(load_train_layer) + img0, _, train_loss = train(train_layer) + img1, _, load_train_loss = train(load_train_layer) self.assertTrue( np.array_equal(train_loss.numpy(), load_train_loss.numpy())) @@ -293,7 +294,8 @@ class TestJitSaveLoadConfig(unittest.TestCase): # enable dygraph mode fluid.enable_dygraph() # config seed - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) def basic_save_load(self, layer, model_path, configs): # 1. train & save @@ -385,7 +387,8 @@ class TestJitMultipleLoading(unittest.TestCase): # enable dygraph mode fluid.enable_dygraph() # config seed - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) # train and save base model self.train_and_save_orig_model() @@ -426,7 +429,8 @@ class TestJitPruneModelAndLoad(unittest.TestCase): # enable dygraph mode fluid.enable_dygraph() # config seed - fluid.default_main_program().random_seed = SEED + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) def train_and_save(self): train_layer = LinearNetReturnHidden(8, 8) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 1992a3bb39807a62966e245d24888cc074746e8d..b76887f0965ca64b2b40bf9c0ce6e82b44fdad2f 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -57,8 +57,8 @@ class LayerTest(unittest.TestCase): @contextlib.contextmanager def static_graph(self): with new_program_scope(): - fluid.default_startup_program().random_seed = self.seed - fluid.default_main_program().random_seed = self.seed + paddle.manual_seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) yield def get_static_graph_result(self, @@ -77,8 +77,8 @@ class LayerTest(unittest.TestCase): def dynamic_graph(self, force_to_use_cpu=False): with fluid.dygraph.guard( self._get_place(force_to_use_cpu=force_to_use_cpu)): - fluid.default_startup_program().random_seed = self.seed - fluid.default_main_program().random_seed = self.seed + paddle.manual_seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) yield @@ -299,7 +299,7 @@ class TestLayer(LayerTest): my_syncbn = paddle.nn.SyncBatchNorm(3) dy_ret = my_syncbn(base.to_variable(t)) dy_ret_value = dy_ret.numpy() - self.assertTrue(np.array_equal(static_ret, static_ret)) + self.assertTrue(np.array_equal(static_ret, dy_ret_value)) def test_relu(self): with self.static_graph(): @@ -1034,7 +1034,7 @@ class TestLayer(LayerTest): static_rlt2 = self.get_static_graph_result( feed=feed_dict, fetch_list=[nce_loss2])[0] - with self.dynamic_graph(force_to_use_cpu=True): + with self.dynamic_graph(): words = [] for i in range(window_size): words.append(base.to_variable(inp_word[i])) @@ -1070,7 +1070,7 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(dy_rlt_value, static_rlt)) - with self.dynamic_graph(force_to_use_cpu=True): + with self.dynamic_graph(): custom_weight = np.random.randn(dict_size, 128).astype("float32") weight_attr = fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( @@ -1996,13 +1996,13 @@ class TestLayer(LayerTest): exe = fluid.Executor(place) 
exe.run(fluid.default_startup_program()) - x = np.random.rand(3, 32, 32).astype("float32") - y = np.array([[1], [0], [1]]) + # x = np.random.rand(3, 32, 32).astype("float32") + # y = np.array([[1], [0], [1]]) static_out = exe.run(feed={"input": x, "label": y}, fetch_list=result[0]) - with self.dynamic_graph(): + with self.dynamic_graph(force_to_use_cpu=True): data = base.to_variable(x) label = base.to_variable(y) fc_out = fluid.layers.fc(data, size=10) diff --git a/python/paddle/fluid/tests/unittests/test_manual_seed.py b/python/paddle/fluid/tests/unittests/test_manual_seed.py index 747026622e4653491feecb852dde67a19b8ff666..a1d6eb915ce78a2b19092f20f8cfeed0d60979b2 100644 --- a/python/paddle/fluid/tests/unittests/test_manual_seed.py +++ b/python/paddle/fluid/tests/unittests/test_manual_seed.py @@ -15,30 +15,33 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid from paddle.framework import manual_seed from paddle.fluid.framework import Program, default_main_program, default_startup_program +import numpy as np class TestManualSeed(unittest.TestCase): def test_manual_seed(self): - local_program = Program() - local_main_prog = default_main_program() - local_start_prog = default_startup_program() - - self.assertEqual(0, local_program.random_seed) - self.assertEqual(0, local_main_prog.random_seed) - self.assertEqual(0, local_start_prog.random_seed) - - manual_seed(102) - global_program1 = Program() - global_program2 = Program() - global_main_prog = default_main_program() - global_start_prog = default_startup_program() - self.assertEqual(102, global_program1.random_seed) - self.assertEqual(102, global_program2.random_seed) - self.assertEqual(102, global_main_prog.random_seed) - self.assertEqual(102, global_start_prog.random_seed) + fluid.enable_dygraph() + + gen = paddle.manual_seed(12312321111) + x = fluid.layers.gaussian_random([10], dtype="float32") + st1 = gen.get_state() + x1 = fluid.layers.gaussian_random([10], dtype="float32") + gen.set_state(st1) + x2 = fluid.layers.gaussian_random([10], dtype="float32") + gen.manual_seed(12312321111) + x3 = fluid.layers.gaussian_random([10], dtype="float32") + x_np = x.numpy() + x1_np = x1.numpy() + x2_np = x2.numpy() + x3_np = x3.numpy() + + if not fluid.core.is_compiled_with_cuda(): + self.assertTrue(np.allclose(x1_np, x2_np)) + self.assertTrue(np.allclose(x_np, x3_np)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index 77ec6f9b6bcda7568325698634fd4f86557cd1be..a535ef5e60397718e97100332b945b360838bbf4 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -19,6 +19,8 @@ import numpy as np import paddle.fluid.core as core from paddle.fluid.op import Operator from op_test import OpTest +import paddle +import paddle.fluid as fluid class TestMomentumOp1(OpTest): @@ -234,5 +236,48 @@ class TestSparseMomentumOp2(TestSparseMomentumOp): self.use_nesterov = True +class TestMomentumV2(unittest.TestCase): + def test_momentum_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. 
+ adam = paddle.optimizer.Momentum( + learning_rate=0.01, momentum=0.9, parameters=linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_momentum(self): + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + rms_optimizer = paddle.optimizer.Momentum( + learning_rate=0.1, momentum=0.9) + rms_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + def test_raise_error(self): + self.assertRaises( + ValueError, paddle.optimizer.Momentum, learning_rate=None) + self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_normal.py b/python/paddle/fluid/tests/unittests/test_normal.py index a9d9af4d50be77bd1d2ecc11dd872ef612209f1e..3e6855feaf491727203063f5c75c68301abbe05e 100644 --- a/python/paddle/fluid/tests/unittests/test_normal.py +++ b/python/paddle/fluid/tests/unittests/test_normal.py @@ -25,7 +25,7 @@ class TestNormalAPI(unittest.TestCase): self.mean = 1.0 self.std = 0.0 self.shape = None - self.repeat_num = 1000 + self.repeat_num = 2000 self.set_attrs() self.dtype = self.get_dtype() self.place=paddle.CUDAPlace(0) \ @@ -134,8 +134,8 @@ class TestNormalAPI(unittest.TestCase): if isinstance(self.mean, np.ndarray) else self.mean std_ref=self.std.reshape([1, -1]) \ if isinstance(self.std, np.ndarray) else self.std - self.assertTrue(np.allclose(mean_ref, mean, 0.1, 0.1)) - self.assertTrue(np.allclose(std_ref, std, 0.1, 0.1)) + self.assertTrue(np.allclose(mean_ref, mean, 0.2, 0.2)) + self.assertTrue(np.allclose(std_ref, std, 0.2, 0.2)) class TestNormalAPI_mean_is_tensor(TestNormalAPI): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py index 5677157fde8d718cbeb6691849421efbbb47dc10..bac196b1ab52b604a85321a5473d455d2616bf0d 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py @@ -13,11 +13,16 @@ # limitations under the License. 
from __future__ import print_function + +import os +import sys import unittest -from test_dist_base import TestDistBase + import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_mnist import TestMnist -import os flag_name = os.path.splitext(__file__)[0] @@ -36,5 +41,11 @@ class TestParallelDygraphMnist(TestDistBase): log_name=flag_name) +class TestParallelDygraphMnistSpawn(TestDistSpawnRunner): + def test_mnist_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn(test_class=TestMnist, delta=1e-5) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py index 8c5cdf8321a4bdff8981b3b0dafed66d067a41e3..cf89dc484c4880092962eb0322b98bac1d15dcd3 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py @@ -13,11 +13,16 @@ # limitations under the License. from __future__ import print_function + +import os +import sys import unittest -from test_dist_base import TestDistBase + import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_se_resnext import TestSeResNeXt -import os flag_name = os.path.splitext(__file__)[0] @@ -36,5 +41,12 @@ class TestParallelDygraphSeResNeXt(TestDistBase): log_name=flag_name) +class TestParallelDygraphSeResNeXtSpawn(TestDistSpawnRunner): + def test_se_resnext_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestSeResNeXt, delta=0.01) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py index 40b5833053d29bb36c5f15aca23aadb761597838..7f051f1005c7b7591bd813e1c4677e383c439ed0 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py @@ -15,10 +15,13 @@ from __future__ import print_function import os +import sys import unittest -import paddle.fluid as fluid +import paddle.fluid as fluid from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_sparse_embedding import TestSparseEmbedding flag_name = os.path.splitext(__file__)[0] @@ -38,5 +41,12 @@ class TestParallelDygraphSparseEmdedding(TestDistBase): log_name=flag_name) +class TestParallelDygraphSparseEmdeddingSpawn(TestDistSpawnRunner): + def test_sparse_embedding_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestSparseEmbedding, delta=1e-5) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py index 385c4d892a650bc18400f4f59cd2ed10bea24d70..c8d47eab2c5191e941d188845a927b5202af9738 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py @@ -15,10 +15,13 @@ 
from __future__ import print_function import os +import sys import unittest -import paddle.fluid as fluid +import paddle.fluid as fluid from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_transformer import TestTransformer flag_name = os.path.splitext(__file__)[0] @@ -38,5 +41,12 @@ class TestParallelDygraphTransformer(TestDistBase): log_name=flag_name) +class TestParallelDygraphTransformerSpawn(TestDistSpawnRunner): + def test_transformer_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestTransformer, delta=1e-5) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py index b1a25ad3529e8b0a4126bc458838ecd876e5af30..1c05b96f1fc61234028e940f6403ae08a0186027 100644 --- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py @@ -174,66 +174,6 @@ class TestPool1d_API(unittest.TestCase): result = max_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) - def check_adaptive_max_dygraph_results(self, place): - with fluid.dygraph.guard(place): - input_np = np.random.random([2, 3, 32]).astype("float32") - input = fluid.dygraph.to_variable(input_np) - result = F.adaptive_max_pool1d(input, output_size=16) - - result_np = max_pool1D_forward_naive( - input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True) - self.assertTrue(np.allclose(result.numpy(), result_np)) - - ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1d( - output_size=16) - result = ada_max_pool1d_dg(input) - self.assertTrue(np.allclose(result.numpy(), result_np)) - - def check_adaptive_avg_dygraph_results(self, place): - with fluid.dygraph.guard(place): - input_np = np.random.random([2, 3, 32]).astype("float32") - input = fluid.dygraph.to_variable(input_np) - result = F.adaptive_avg_pool1d(input, output_size=16) - result_np = avg_pool1D_forward_naive( - input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True) - - self.assertTrue(np.allclose(result.numpy(), result_np)) - - ada_max_pool1d_dg = paddle.nn.layer.AdaptiveAvgPool1d( - output_size=16) - result = ada_max_pool1d_dg(input) - self.assertTrue(np.allclose(result.numpy(), result_np)) - - def check_adaptive_max_static_results(self, place): - with fluid.program_guard(fluid.Program(), fluid.Program()): - input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") - result = F.adaptive_max_pool1d(input, output_size=16) - - input_np = np.random.random([2, 3, 32]).astype("float32") - result_np = max_pool1D_forward_naive( - input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True) - - exe = fluid.Executor(place) - fetches = exe.run(fluid.default_main_program(), - feed={"input": input_np}, - fetch_list=[result]) - self.assertTrue(np.allclose(fetches[0], result_np)) - - def check_adaptive_avg_static_results(self, place): - with fluid.program_guard(fluid.Program(), fluid.Program()): - input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") - result = F.adaptive_avg_pool1d(input, output_size=16) - - input_np = np.random.random([2, 3, 32]).astype("float32") - result_np = avg_pool1D_forward_naive( - input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True) - - exe = fluid.Executor(place) - fetches = exe.run(fluid.default_main_program(), - feed={"input": input_np}, - fetch_list=[result]) - 
self.assertTrue(np.allclose(fetches[0], result_np)) - def check_max_dygraph_padding_same(self, place): with fluid.dygraph.guard(place): input_np = np.random.random([2, 3, 32]).astype("float32") @@ -265,10 +205,6 @@ class TestPool1d_API(unittest.TestCase): self.check_avg_dygraph_results(place) self.check_max_static_results(place) self.check_avg_static_results(place) - self.check_adaptive_max_dygraph_results(place) - self.check_adaptive_avg_dygraph_results(place) - self.check_adaptive_max_static_results(place) - self.check_adaptive_avg_static_results(place) self.check_max_dygraph_padding_same(place) self.check_avg_dygraph_padding_same(place) diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 6045f2d713627cedfe169b9e066222904244311a..32d8f73552f71dd1066e19b70f1f2e4b8628950a 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -147,10 +147,8 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor): with fluid.program_guard(fluid.Program(), fluid.Program()): with fluid.scope_guard(fluid.core.Scope()): - fluid.default_main_program().random_seed = 1 - fluid.default_startup_program().random_seed = 1 + gen = paddle.manual_seed(1) np.random.seed(1) - img = fluid.layers.data(name='image', shape=[784], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') loss = simple_fc_net(img, label, use_py_func_op) @@ -189,17 +187,17 @@ class TestPyFuncOpUseExecutor(unittest.TestCase): self.use_parallel_executor = False def test_loss_diff(self): - losses = [] for use_cuda in [True, False]: + losses = [] for use_py_func_op in [True, False]: L = test_main(use_cuda, use_py_func_op, self.use_parallel_executor) if L is not None: losses.append(L) - for idx in six.moves.range(len(losses) - 1): - max_diff = np.max(np.abs(losses[idx] - losses[0])) - self.assertAlmostEqual(max_diff, 0, delta=1e-3) + for idx in six.moves.range(len(losses) - 1): + max_diff = np.max(np.abs(losses[idx] - losses[0])) + self.assertAlmostEqual(max_diff, 0, delta=1e-3) class TestPyFuncOpUseParallelExecutor(TestPyFuncOpUseExecutor): diff --git a/python/paddle/fluid/tests/unittests/test_random_seed.py b/python/paddle/fluid/tests/unittests/test_random_seed.py index 2933abe46c1b87959c9f61975c02a41e91dfbef3..343508bf619b6a7328016464794221f7284435b2 100644 --- a/python/paddle/fluid/tests/unittests/test_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_random_seed.py @@ -26,27 +26,31 @@ import paddle.fluid.core as core class TestGeneratorSeed(unittest.TestCase): - """ - Test cases for cpu generator seed. - """ + # """ + # Test cases for cpu generator seed. 
+ # """ def test_generator_uniform_random_dygraph(self): """Test Generator seed.""" - gen = generator.Generator() fluid.enable_dygraph() - gen.manual_seed(12312321111) + gen = paddle.manual_seed(12312321111) x = fluid.layers.uniform_random([10], dtype="float32", min=0.0, max=1.0) + st1 = gen.get_state() x1 = fluid.layers.uniform_random( [10], dtype="float32", min=0.0, max=1.0) + gen.set_state(st1) + print(gen.get_state()) x2 = fluid.layers.uniform_random( [10], dtype="float32", min=0.0, max=1.0) - gen.manual_seed(12312321111) + + paddle.manual_seed(12312321111) x3 = fluid.layers.uniform_random( [10], dtype="float32", min=0.0, max=1.0) + x_np = x.numpy() x1_np = x1.numpy() x2_np = x2.numpy() @@ -57,11 +61,9 @@ class TestGeneratorSeed(unittest.TestCase): self.assertTrue(np.allclose(x_np, x3_np)) def test_generator_uniform_random_static(self): - fluid.disable_dygraph() - gen = generator.Generator() - gen.manual_seed(123123143) + gen = paddle.manual_seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -93,11 +95,9 @@ class TestGeneratorSeed(unittest.TestCase): self.assertTrue(not np.allclose(out1_res2, out1_res1)) def test_gen_dropout_dygraph(self): - gen = generator.Generator() - fluid.enable_dygraph() - gen.manual_seed(111111111) + gen = paddle.manual_seed(111111111) st = gen.get_state() # x = np.arange(1,101).reshape(2,50).astype("float32") x = fluid.layers.uniform_random( @@ -110,8 +110,7 @@ class TestGeneratorSeed(unittest.TestCase): y1 = fluid.layers.dropout(x1, 0.5) y_np = y.numpy() y1_np = y1.numpy() - #print(y_np) - #print(y1_np) + if not core.is_compiled_with_cuda(): print(">>>>>>> dropout dygraph >>>>>>>") self.assertTrue(np.allclose(y_np, y1_np)) @@ -119,8 +118,7 @@ class TestGeneratorSeed(unittest.TestCase): def test_gen_dropout_static(self): fluid.disable_dygraph() - gen = generator.Generator() - gen.manual_seed(123123143) + gen = paddle.manual_seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -137,19 +135,16 @@ class TestGeneratorSeed(unittest.TestCase): out2 = exe.run(train_program, feed={}, fetch_list=[y_1]) out1_np = np.array(out1[0]) out2_np = np.array(out2[0]) - # print(out1_np) - # print(out2_np) + if not core.is_compiled_with_cuda(): print(">>>>>>> dropout static >>>>>>>") self.assertTrue(np.allclose(out1_np, out2_np)) def test_generator_gaussian_random_dygraph(self): """Test Generator seed.""" - gen = generator.Generator() - fluid.enable_dygraph() - gen.manual_seed(12312321111) + gen = paddle.manual_seed(12312321111) x = fluid.layers.gaussian_random([10], dtype="float32") st1 = gen.get_state() x1 = fluid.layers.gaussian_random([10], dtype="float32") @@ -168,11 +163,9 @@ class TestGeneratorSeed(unittest.TestCase): self.assertTrue(np.allclose(x_np, x3_np)) def test_generator_gaussian_random_static(self): - fluid.disable_dygraph() - gen = generator.Generator() - gen.manual_seed(123123143) + gen = paddle.manual_seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -210,7 +203,7 @@ class TestGeneratorSeed(unittest.TestCase): fluid.enable_dygraph() - gen.manual_seed(12312321111) + gen = paddle.manual_seed(12312321111) x = paddle.randint(low=10, shape=[10], dtype="int32") st1 = gen.get_state() x1 = paddle.randint(low=10, shape=[10], dtype="int32") @@ -228,12 +221,64 @@ class TestGeneratorSeed(unittest.TestCase): self.assertTrue(np.allclose(x1_np, x2_np)) self.assertTrue(np.allclose(x_np, x3_np)) - def test_generator_ranint_static(self): + def 
test_generator_uniform_random_static(self): + fluid.disable_dygraph() + + gen = paddle.manual_seed(123123143) + + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # example 1: + # attr shape is a list which doesn't contain tensor Variable. + result_1 = fluid.layers.uniform_random(shape=[3, 4]) + result_2 = fluid.layers.uniform_random(shape=[3, 4]) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + out1 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + #gen.set_state(cur_state) + gen.manual_seed(123123143) + out2 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + out1_res1 = np.array(out1[0]) + out1_res2 = np.array(out1[1]) + out2_res1 = np.array(out2[0]) + out2_res2 = np.array(out2[1]) + + if not core.is_compiled_with_cuda(): + self.assertTrue(np.allclose(out1_res1, out2_res1)) + self.assertTrue(np.allclose(out1_res2, out2_res2)) + self.assertTrue(not np.allclose(out1_res2, out1_res1)) + + def test_generator_randint_dygraph(self): + """Test Generator seed.""" + fluid.enable_dygraph() + + gen = paddle.manual_seed(12312321111) + x = paddle.randint(low=1) + st1 = gen.get_state() + x1 = paddle.randint(low=1) + gen.set_state(st1) + x2 = paddle.randint(low=1) + gen.manual_seed(12312321111) + x3 = paddle.randint(low=1) + x_np = x.numpy() + x1_np = x1.numpy() + x2_np = x2.numpy() + x3_np = x3.numpy() + if not core.is_compiled_with_cuda(): + self.assertTrue(np.allclose(x1_np, x2_np)) + self.assertTrue(np.allclose(x_np, x3_np)) + + def test_generator_ranint_static(self): fluid.disable_dygraph() - gen = generator.Generator() - gen.manual_seed(123123143) + gen = paddle.manual_seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -267,11 +312,10 @@ class TestGeneratorSeed(unittest.TestCase): def test_generator_randperm_dygraph(self): """Test Generator seed.""" - gen = generator.Generator() fluid.enable_dygraph() - gen.manual_seed(12312321111) + gen = paddle.manual_seed(12312321111) x = paddle.randperm(10) st1 = gen.get_state() x1 = paddle.randperm(10) @@ -284,9 +328,6 @@ class TestGeneratorSeed(unittest.TestCase): x2_np = x2.numpy() x3_np = x3.numpy() - # print("## {}".format(x1_np)) - # print("## {}".format(x2_np)) - if not core.is_compiled_with_cuda(): print(">>>>>>> randperm dygraph >>>>>>>") self.assertTrue(np.allclose(x1_np, x2_np)) @@ -296,8 +337,7 @@ class TestGeneratorSeed(unittest.TestCase): fluid.disable_dygraph() - gen = generator.Generator() - gen.manual_seed(123123143) + paddle.manual_seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -312,8 +352,8 @@ class TestGeneratorSeed(unittest.TestCase): out1 = exe.run(train_program, feed={}, fetch_list=[result_1, result_2]) - #gen.set_state(cur_state) - gen.manual_seed(123123143) + + paddle.manual_seed(123123143) out2 = exe.run(train_program, feed={}, fetch_list=[result_1, result_2]) @@ -331,7 +371,7 @@ class TestGeneratorSeed(unittest.TestCase): def test_generator_sampling_id_dygraph(self): """Test Generator seed.""" - gen = generator.Generator() + gen = paddle.manual_seed(12312321111) fluid.enable_dygraph() @@ -339,14 +379,17 @@ class TestGeneratorSeed(unittest.TestCase): x = fluid.layers.uniform_random( [10, 10], dtype="float32", min=0.0, max=1.0) y = fluid.layers.sampling_id(x) + st1 = gen.get_state() x1 = fluid.layers.uniform_random( [10, 10], dtype="float32", min=0.0, max=1.0) y1 = fluid.layers.sampling_id(x) + gen.set_state(st1) x2 = 
fluid.layers.uniform_random( [10, 10], dtype="float32", min=0.0, max=1.0) y2 = fluid.layers.sampling_id(x) + gen.manual_seed(12312321111) x3 = fluid.layers.uniform_random( [10, 10], dtype="float32", min=0.0, max=1.0) @@ -357,9 +400,6 @@ class TestGeneratorSeed(unittest.TestCase): x2_np = y2.numpy() x3_np = y3.numpy() - print("## {}".format(x1_np)) - print("## {}".format(x2_np)) - if not core.is_compiled_with_cuda(): print(">>>>>>> sampling id dygraph >>>>>>>") self.assertTrue(np.allclose(x1_np, x2_np)) @@ -369,8 +409,7 @@ class TestGeneratorSeed(unittest.TestCase): fluid.disable_dygraph() - gen = generator.Generator() - gen.manual_seed(123123143) + paddle.manual_seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -386,8 +425,8 @@ class TestGeneratorSeed(unittest.TestCase): out1 = exe.run(train_program, feed={}, fetch_list=[result_1, result_2]) - #gen.set_state(cur_state) - gen.manual_seed(123123143) + + paddle.manual_seed(123123143) out2 = exe.run(train_program, feed={}, fetch_list=[result_1, result_2]) @@ -406,8 +445,7 @@ class TestGeneratorSeed(unittest.TestCase): def test_gen_TruncatedNormal_initializer(self): fluid.disable_dygraph() - gen = generator.Generator() - gen.manual_seed(123123143) + gen = paddle.manual_seed(123123143) cur_state = gen.get_state() startup_program = fluid.Program() @@ -432,9 +470,7 @@ class TestGeneratorSeed(unittest.TestCase): out1 = exe.run(train_program, feed={}, fetch_list=[result_1, result_2]) - #gen.set_state(cur_state) - #gen.set_state(cur_state) gen.manual_seed(123123143) with fluid.program_guard(train_program, startup_program): exe.run(startup_program) @@ -447,11 +483,6 @@ class TestGeneratorSeed(unittest.TestCase): out2_res1 = np.array(out2[0]) out2_res2 = np.array(out2[1]) - print(out1_res1) - print(out1_res2) - print(out2_res1) - print(out2_res2) - if not core.is_compiled_with_cuda(): print(">>>>>>> sampling id static >>>>>>>") self.assertTrue(np.allclose(out1_res1, out2_res1)) diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index 58b407f8bc1f41301a068f0b85f4c4e9860a45ff..44087c5421a5ee66273ef35b935926d42dcc37ae 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -169,9 +169,10 @@ class TestRegularizer(unittest.TestCase): return param_sum def check_l2decay_regularizer(self, place, model): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() - startup_prog.random_seed = 1 with self.scope_prog_guard( main_prog=main_prog, startup_prog=startup_prog): data = fluid.layers.data( @@ -188,9 +189,11 @@ class TestRegularizer(unittest.TestCase): return param_sum def check_l2decay(self, place, model): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() - startup_prog.random_seed = 1 + with self.scope_prog_guard( main_prog=main_prog, startup_prog=startup_prog): data = fluid.layers.data( @@ -243,7 +246,8 @@ class TestRegularizer(unittest.TestCase): with fluid.dygraph.guard(): input = fluid.dygraph.to_variable( np.random.randn(3, 5).astype('float32')) - fluid.default_main_program().random_seed = 1 + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) linear1 = fluid.dygraph.Linear( 5, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) diff --git 
a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py index fb3fc8735566fcf601a7cb507e3826dd92a5651e..2c87e06e893a4d6495ad81ac3dcdf375a41272fb 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py @@ -20,6 +20,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.op import Operator from op_test import OpTest +import paddle class TestSGDOp(OpTest): @@ -208,5 +209,46 @@ class TestSGDOpWithLargeInput(unittest.TestCase): result = exe.run(compiled_prog, fetch_list=[avg_cost]) +class TestSGDV2(unittest.TestCase): + def test_sgd_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.SGD(learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_sgd(self): + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + rms_optimizer = paddle.optimizer.SGD(learning_rate=0.1) + rms_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + def test_raise_error(self): + self.assertRaises(ValueError, paddle.optimizer.SGD, learning_rate=None) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py new file mode 100644 index 0000000000000000000000000000000000000000..ca92bc75245cebbfdfbbed80e99957d2b4f57b2a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py @@ -0,0 +1,87 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import numpy as np +import unittest + +import paddle +import paddle.distributed as dist +from paddle.distributed.spawn import _get_subprocess_env_list + +from paddle.fluid import core +from paddle.fluid.dygraph import parallel_helper + +# NOTE(chenweihang): Coverage CI is currently not able to count python3 +# unittest, so the unittests here covers some cases that will only be +# executed in the python3 sub-process. 
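The spawn-style test classes added earlier (TestParallelDygraphMnistSpawn and friends) and the helpers exercised in this new file all drive training through the paddle.distributed.spawn entry point. A minimal sketch of that pattern, assuming a CUDA build with at least two visible devices; the toy layer, data, and nprocs value here are illustrative, not taken from the patch:

    import numpy as np
    import paddle
    import paddle.distributed as dist

    def train():
        # Each spawned worker switches to dygraph mode and initializes its own
        # parallel context before building any layers.
        paddle.disable_static()
        dist.init_parallel_env()

        linear = paddle.nn.Linear(13, 5)
        sgd = paddle.optimizer.SGD(
            learning_rate=0.01, parameters=linear.parameters())

        # Toy forward/backward step, mirroring the dygraph optimizer tests above.
        value = np.arange(26).reshape(2, 13).astype("float32")
        out = linear(paddle.to_tensor(value))
        out.backward()
        sgd.step()
        sgd.clear_gradients()

    if __name__ == '__main__':
        # Mirrors the guards in the tests above: CUDA build and Python >= 3.4.
        dist.spawn(train, nprocs=2)

A real runner such as parallel_dygraph_mnist.py typically also wraps the layer with paddle.DataParallel so that gradients are synchronized across the spawned workers.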
+ + +class TestInitParallelEnv(unittest.TestCase): + def test_beckend_type_error(self): + with self.assertRaises(TypeError): + dist.init_parallel_env(backend=1) + + def test_backend_value_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env(backend="mpi") + + def test_check_env_failed(self): + os.environ['FLAGS_selected_gpus'] = '0' + os.environ['PADDLE_TRAINER_ID'] = '0' + os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170' + os.environ['PADDLE_TRAINERS_NUM'] = '1' + with self.assertRaises(ValueError): + dist.init_parallel_env() + + def test_init_parallel_env_break(self): + os.environ['FLAGS_selected_gpus'] = '0' + os.environ['PADDLE_TRAINER_ID'] = '0' + os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170' + os.environ['PADDLE_TRAINERS_NUM'] = '1' + os.environ['PADDLE_TRAINER_ENDPOINTS'] = '127.0.0.1:6170' + # coverage success branch + dist.init_parallel_env() + self.assertFalse(parallel_helper._is_parallel_ctx_initialized()) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSpawnAssistMethod(unittest.TestCase): + def test_only_cluster_node_ips_error(self): + with self.assertRaises(ValueError): + options = dict() + options['cluster_node_ips'] = "127.0.0.1,127.0.0.2" + _get_subprocess_env_list(nprocs=1, options=options) + + def test_nprocs_greater_than_device_num_error(self): + with self.assertRaises(RuntimeError): + _get_subprocess_env_list(nprocs=100, options=dict()) + + def test_selected_gpus_error(self): + with self.assertRaises(ValueError): + options = dict() + options['selected_gpus'] = "100,101" + _get_subprocess_env_list(nprocs=2, options=options) + + def test_get_correct_env(self): + env_dict = _get_subprocess_env_list(nprocs=1, options=dict())[0] + self.assertEqual(env_dict['PADDLE_TRAINER_ID'], '0') + self.assertEqual(env_dict['PADDLE_TRAINERS_NUM'], '1') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index b0701a9b187f6c7cf63f43d69f482ea13e6d3fe3..09cd40d9cc59914c82cc343bb78b72fbc2b29e59 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -221,5 +221,21 @@ class TestDygraphSyncBatchNormAPIError(unittest.TestCase): self.assertRaises(TypeError, my_sync_batch_norm, x2) +class TestConvertSyncBatchNorm(unittest.TestCase): + def test_convert(self): + if not core.is_compiled_with_cuda(): + return + + with program_guard(Program(), Program()): + model = paddle.nn.Sequential( + paddle.nn.Conv2d(3, 5, 3), paddle.nn.BatchNorm2d(5)) + sync_model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) + for idx, sublayer in enumerate(model.sublayers()): + if isinstance(sublayer, paddle.nn.BatchNorm2d): + self.assertEqual( + isinstance(sync_model[idx], paddle.nn.SyncBatchNorm), + True) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py index c8d1e77134036bf7b28d4afb8bacaa44092b1053..5fea9f69a18c83be0f6af05784735ea53d0993d2 100644 --- a/python/paddle/fluid/tests/unittests/test_transformer_api.py +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -211,7 +211,8 @@ def ffn(src, encoder_layer, ffn_fc1_act="relu"): class TestTransformer(unittest.TestCase): def test_multi_head_attention(self): def 
multihead_attention_test_helper(self_attention, cache): - paddle.framework.manual_seed(2020) + paddle.manual_seed(2020) + paddle.framework.random._manual_program_seed(2020) # self_attention|cross_attention, cache|No cache with fluid.dygraph.guard(fluid.CPUPlace()): @@ -275,6 +276,7 @@ class TestTransformer(unittest.TestCase): with fluid.dygraph.guard(fluid.CPUPlace()): paddle.framework.manual_seed(2020) + paddle.framework.random._manual_program_seed(2020) ffn_fc1_act = "relu" # 1.generate basic params diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 158462a1e6e1012b7473a2410f2c003d04ea2e40..a04aaaef0d41b9f991889586b489269b6ede5b42 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -21,6 +21,7 @@ import numpy as np from op_test import OpTest import paddle import paddle.fluid.core as core +import paddle from paddle.fluid.op import Operator import paddle.fluid as fluid from paddle.fluid import Program, program_guard @@ -234,16 +235,16 @@ class TestUniformRandomOpSelectedRows(unittest.TestCase): def check_with_place(self, place): scope = core.Scope() out = scope.var("X").get_selected_rows() - + paddle.manual_seed(10) op = Operator( "uniform_random", Out="X", - shape=[4, 784], + shape=[100, 784], min=-5.0, max=10.0, seed=10) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [4, 784]) + self.assertEqual(out.get_tensor().shape(), [100, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -255,19 +256,19 @@ class TestUniformRandomOpSelectedRowsWithDiagInit( def check_with_place(self, place): scope = core.Scope() out = scope.var("X").get_selected_rows() - + paddle.manual_seed(10) op = Operator( "uniform_random", Out="X", - shape=[4, 784], + shape=[100, 784], min=-5.0, max=10.0, seed=10, - diag_num=4, + diag_num=100, diag_step=784, diag_val=1.0) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [4, 784]) + self.assertEqual(out.get_tensor().shape(), [100, 784]) hist, prob = output_hist_diag(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -276,6 +277,7 @@ class TestUniformRandomOpSelectedRowsWithDiagInit( class TestUniformRandomOpApi(unittest.TestCase): def test_api(self): + paddle.manual_seed(10) x = fluid.layers.data('x', shape=[16], dtype='float32', lod_level=1) y = fluid.layers.fc(x, size=16, @@ -347,12 +349,15 @@ class TestUniformRandomOp_attr_tensor_API(unittest.TestCase): class TestUniformRandomOp_API_seed(unittest.TestCase): def test_attr_tensor_API(self): + _seed = 10 + gen = paddle.manual_seed(_seed) + gen._is_init_py = False startup_program = fluid.Program() train_program = fluid.Program() with fluid.program_guard(train_program, startup_program): _min = 5 _max = 10 - _seed = 10 + ret = fluid.layers.nn.uniform_random( [2, 3, 2], min=_min, max=_max, seed=_seed) ret_2 = fluid.layers.nn.uniform_random( @@ -386,8 +391,8 @@ class TestUniformRandomOpSelectedRowsShapeTensor(unittest.TestCase): scope = core.Scope() out = scope.var("X").get_selected_rows() shape_tensor = scope.var("Shape").get_tensor() - shape_tensor.set(np.array([4, 784]).astype("int64"), place) - + shape_tensor.set(np.array([100, 784]).astype("int64"), place) + paddle.manual_seed(10) op = Operator( "uniform_random", ShapeTensor="Shape", @@ -396,7 +401,7 @@ class TestUniformRandomOpSelectedRowsShapeTensor(unittest.TestCase): max=10.0, seed=10) op.run(scope, 
place) - self.assertEqual(out.get_tensor().shape(), [4, 784]) + self.assertEqual(out.get_tensor().shape(), [100, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -418,10 +423,10 @@ class TestUniformRandomOpSelectedRowsShapeTensorList(unittest.TestCase): scope = core.Scope() out = scope.var("X").get_selected_rows() shape_1 = scope.var("shape1").get_tensor() - shape_1.set(np.array([4]).astype("int64"), place) + shape_1.set(np.array([100]).astype("int64"), place) shape_2 = scope.var("shape2").get_tensor() shape_2.set(np.array([784]).astype("int64"), place) - + paddle.manual_seed(10) op = Operator( "uniform_random", ShapeTensorList=["shape1", "shape2"], @@ -430,7 +435,7 @@ class TestUniformRandomOpSelectedRowsShapeTensorList(unittest.TestCase): max=10.0, seed=10) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [4, 784]) + self.assertEqual(out.get_tensor().shape(), [100, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -455,21 +460,21 @@ class TestUniformRandomBatchSizeLikeOpError(unittest.TestCase): def test_Variable(): x1 = fluid.create_lod_tensor( - np.zeros((4, 784)), [[1, 1, 1, 1]], fluid.CPUPlace()) + np.zeros((100, 784)), [[10, 10, 10, 70]], fluid.CPUPlace()) fluid.layers.uniform_random_batch_size_like(x1) self.assertRaises(TypeError, test_Variable) def test_shape(): x1 = fluid.layers.data( - name='x2', shape=[4, 784], dtype='float32') + name='x2', shape=[100, 784], dtype='float32') fluid.layers.uniform_random_batch_size_like(x1, shape="shape") self.assertRaises(TypeError, test_shape) def test_dtype(): x2 = fluid.layers.data( - name='x2', shape=[4, 784], dtype='float32') + name='x2', shape=[100, 784], dtype='float32') fluid.layers.uniform_random_batch_size_like(x2, 'int32') self.assertRaises(TypeError, test_dtype) @@ -495,20 +500,20 @@ class TestUniformOpError(unittest.TestCase): def test_Variable(): x1 = fluid.create_lod_tensor( - np.zeros((4, 784)), [[1, 1, 1, 1]], fluid.CPUPlace()) + np.zeros((100, 784)), [[10, 10, 10, 70]], fluid.CPUPlace()) paddle.tensor.random.uniform(x1) self.assertRaises(TypeError, test_Variable) def test_Variable2(): - x1 = np.zeros((4, 784)) + x1 = np.zeros((100, 784)) paddle.tensor.random.uniform(x1) self.assertRaises(TypeError, test_Variable2) def test_dtype(): x2 = fluid.layers.data( - name='x2', shape=[4, 784], dtype='float32') + name='x2', shape=[100, 784], dtype='float32') paddle.tensor.random.uniform(x2, 'int32') self.assertRaises(TypeError, test_dtype) diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 80b94704c388824901312b5d577cb5cfd0d0c75b..c8383bb950d3ed7b2cdfafa185b0ad156bf7c7bf 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -32,6 +32,30 @@ class TestVarBase(unittest.TestCase): def test_to_tensor(self): def _test_place(place): with fluid.dygraph.guard(): + paddle.set_default_dtype('float32') + x = paddle.to_tensor(1, place=place, stop_gradient=False) + self.assertTrue(np.array_equal(x.numpy(), [1])) + self.assertNotEqual(x.dtype, core.VarDesc.VarType.FP32) + + x = paddle.to_tensor(1.2, place=place, stop_gradient=False) + self.assertTrue( + np.array_equal(x.numpy(), np.array([1.2]).astype( + 'float32'))) + self.assertEqual(x.dtype, core.VarDesc.VarType.FP32) + + x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False) + self.assertTrue(np.array_equal(x.numpy(), [1 + 2j])) + 
self.assertEqual(x.dtype, 'complex64') + + paddle.set_default_dtype('float64') + x = paddle.to_tensor(1.2, place=place, stop_gradient=False) + self.assertTrue(np.array_equal(x.numpy(), [1.2])) + self.assertEqual(x.dtype, core.VarDesc.VarType.FP64) + + x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False) + self.assertTrue(np.array_equal(x.numpy(), [1 + 2j])) + self.assertEqual(x.dtype, 'complex128') + x = paddle.to_tensor( 1, dtype='float32', place=place, stop_gradient=False) self.assertTrue(np.array_equal(x.numpy(), [1.])) diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py index 207ff66a0f877598989e47a8632aa783b53bcc67..ee01bfb21f8206133aa55e8962b4bcc46233085e 100644 --- a/python/paddle/fluid/tests/unittests/test_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_op.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid.layers as layers from paddle.fluid.executor import Executor import paddle.fluid.core as core diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 95a0cb52046790e44150dd6f74733ae86a75a570..b2975283fbef010029b935b9b209411f09bdb5fd 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -50,8 +50,6 @@ from ..fluid.dygraph.base import to_variable #DEFINE_ALIAS from ..fluid.dygraph.base import grad #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import load_dygraph as load #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import save_dygraph as save #DEFINE_ALIAS -from ..fluid.dygraph.parallel import prepare_context #DEFINE_ALIAS -from ..fluid.dygraph.parallel import ParallelEnv #DEFINE_ALIAS from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py index 1bb13294805efca319546b432354ce5ba6b1e868..2555d24464112ed8446d863dc8e65cfa37680b36 100644 --- a/python/paddle/framework/random.py +++ b/python/paddle/framework/random.py @@ -14,28 +14,50 @@ # TODO: define random api import paddle.fluid as fluid +from paddle.fluid import core __all__ = ['manual_seed'] def manual_seed(seed): """ - :alias_main: paddle.manual_seed - :alias: paddle.manual_seed,paddle.framework.random.manual_seed - Set global manual seed for program + Sets the seed for the global default generator, which manages the random number generation. Args: - manual_seed(int): random seed for program + seed(int): The random seed to set. It is recommended to set a large int number. Returns: - None. + Generator: The global default generator object. Examples: .. code-block:: python - from paddle.framework import manual_seed - manual_seed(102) + import paddle + gen = paddle.manual_seed(102) + + """ + #TODO(zhiqiu): 1. remove program.random_seed when all random-related ops are upgraded + # 2. support gpu generator by global device + + seed = int(seed) + + core.default_cpu_generator()._is_init_py = True + return core.default_cpu_generator().manual_seed(seed) + + + def _manual_program_seed(seed): + """ + Sets the global seed for generating random numbers. + + NOTE(zhiqiu): This is the original implementation of manual_seed. Keep it temporarily + since the CUDA generator is not developed yet and we still need it in the unittests. + + Args: + seed(int): The random seed to set. It is recommended to set a large int number. 
+ + Returns: + None """ fluid.default_main_program().random_seed = seed fluid.default_startup_program().random_seed = seed diff --git a/python/paddle/incubate/hapi/model.py b/python/paddle/incubate/hapi/model.py index e4a6b03f7aa5c7f537dc476d8f80162e530d5dbe..b52354d4ccf4671b0d372bae63a1befbe383e053 100644 --- a/python/paddle/incubate/hapi/model.py +++ b/python/paddle/incubate/hapi/model.py @@ -891,33 +891,31 @@ class Model(object): class Mnist(paddle.nn.Layer): def __init__(self): - super(MyNet, self).__init__() - self._fc = Linear(784, 1, act='softmax') + super(Mnist, self).__init__() + self._fc = Linear(784, 10, act='softmax') - @paddle.jit.to_static # If save for inference in dygraph, need this - def forward(self, x): - y = self._fc(x) - return y + # If save for inference in dygraph, need this + @paddle.jit.to_static + def forward(self, x): + y = self._fc(x) + return y - dynamic = True # False + dynamic = True # False device = hapi.set_device('cpu') # if use static graph, do not set paddle.disable_static(device) if dynamic else None - # inputs and labels are not required for dynamic graph. input = hapi.Input([None, 784], 'float32', 'x') label = hapi.Input([None, 1], 'int64', 'label') - model = hapi.Model(Mnist(), input, label) optim = paddle.optimizer.SGD(learning_rate=1e-3, - parameter_list=model.parameters()) - model.prepare(optim, - paddle.nn.CrossEntropyLoss(), - hapi.metrics.Accuracy()) + parameter_list=model.parameters()) + model.prepare(optim, paddle.nn.CrossEntropyLoss()) mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False) model.fit(mnist_data, epochs=1, batch_size=32, verbose=0) - model.save('checkpoint/test') # save for training - model.save('inference_model', False) # save for inference + model.save('checkpoint/test') # save for training + model.save('inference_model', False) # save for inference + """ if ParallelEnv().local_rank == 0: @@ -1534,47 +1532,6 @@ class Model(object): Returns: list: The fetch variables' name list - - Examples: - .. code-block:: python - import numpy as np - import paddle - from paddle.static import InputSpec - - import paddle.incubate.hapi as hapi - from paddle.nn import Linear - from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset - - class Mnist(Layer): - def __init__(self, classifier_act=None): - super(Mnist, self).__init__() - - self.fc = Linear(input_dim=784, output_dim=10, act="softmax") - - @paddle.jit.to_static # In static mode, you need to delete this. - def forward(self, inputs): - outputs = self.fc(inputs) - return outputs - - dynamic = True # False - device = hapi.set_device('gpu') - - # if use static graph, do not set - paddle.disable_static(device) if dynamic else None - - # inputs and labels are not required for dynamic graph. - input = InputSpec([None, 784], 'float32', 'x') - label = InputSpec([None, 1], 'int64', 'label') - - model = hapi.Model(Mnist(), input, label) - optim = paddle.optimizer.SGD(learning_rate=1e-3, - parameter_list=model.parameters()) - model.prepare(optim, - paddle.nn.CrossEntropyLoss(), - hapi.metrics.Accuracy()) - mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False) - model.fit(mnist_data, epochs=1, batch_size=32, verbose=0) - model.save_inference_model('inference_model') """ def get_inout_spec(all_vars, return_name=False): @@ -1592,65 +1549,66 @@ class Model(object): # the inputs of the model in running. # 3. Make it Unnecessary to add `@paddle.jit.to_static` for users in dynamic mode. 
if fluid.in_dygraph_mode(): - layer = self.network - fluid.disable_dygraph() - - # 1. input check - prog_translator = ProgramTranslator() - if not prog_translator.enable_declarative: - raise RuntimeError( - "save_inference_model doesn't work when setting ProgramTranslator.enable=False." - ) - if not isinstance(layer, Layer): - raise TypeError( - "The input layer should be 'Layer', but received layer type is %s." - % type(layer)) - - # 2. get program of declarative Layer.forward - concrete_program = layer.forward.concrete_program - - # NOTE: we maintain the mapping of variable name to - # structured name, the buffer variable (non-persistable) - # saved to inference program may not need by dygraph Layer, - # we only record the state_dict variable's structured name - state_names_dict = dict() - for structured_name, var in layer.state_dict().items(): - state_names_dict[var.name] = structured_name - - # 3. share parameters from Layer to scope & record var info - scope = core.Scope() - extra_var_info = dict() - for param_or_buffer in concrete_program.parameters: - # share to scope - param_or_buffer_tensor = scope.var( - param_or_buffer.name).get_tensor() - src_tensor = param_or_buffer.value().get_tensor() - param_or_buffer_tensor._share_data_with(src_tensor) - # record var info - extra_info_dict = dict() - if param_or_buffer.name in state_names_dict: - extra_info_dict['structured_name'] = state_names_dict[ - param_or_buffer.name] - extra_info_dict['stop_gradient'] = param_or_buffer.stop_gradient - if isinstance(param_or_buffer, ParamBase): - extra_info_dict['trainable'] = param_or_buffer.trainable - extra_var_info[param_or_buffer.name] = extra_info_dict - - # 4. build input & output spec - input_var_names = get_inout_spec(concrete_program.inputs, True) - output_vars = get_inout_spec(concrete_program.outputs) - - # 5. save inference model - with scope_guard(scope): - return fluid.io.save_inference_model( - dirname=save_dir, - feeded_var_names=input_var_names, - target_vars=output_vars, - executor=Executor(_current_expected_place()), - main_program=concrete_program.main_program.clone(), - model_filename=model_filename, - params_filename=params_filename, - program_only=model_only) + with fluid.framework._dygraph_guard(None): + layer = self.network + + # 1. input check + prog_translator = ProgramTranslator() + if not prog_translator.enable_declarative: + raise RuntimeError( + "save_inference_model doesn't work when setting ProgramTranslator.enable=False." + ) + if not isinstance(layer, Layer): + raise TypeError( + "The input layer should be 'Layer', but received layer type is %s." + % type(layer)) + + # 2. get program of declarative Layer.forward + concrete_program = layer.forward.concrete_program + + # NOTE: we maintain the mapping of variable name to + # structured name, the buffer variable (non-persistable) + # saved to inference program may not need by dygraph Layer, + # we only record the state_dict variable's structured name + state_names_dict = dict() + for structured_name, var in layer.state_dict().items(): + state_names_dict[var.name] = structured_name + + # 3. 
share parameters from Layer to scope & record var info + scope = core.Scope() + extra_var_info = dict() + for param_or_buffer in concrete_program.parameters: + # share to scope + param_or_buffer_tensor = scope.var( + param_or_buffer.name).get_tensor() + src_tensor = param_or_buffer.value().get_tensor() + param_or_buffer_tensor._share_data_with(src_tensor) + # record var info + extra_info_dict = dict() + if param_or_buffer.name in state_names_dict: + extra_info_dict['structured_name'] = state_names_dict[ + param_or_buffer.name] + extra_info_dict[ + 'stop_gradient'] = param_or_buffer.stop_gradient + if isinstance(param_or_buffer, ParamBase): + extra_info_dict['trainable'] = param_or_buffer.trainable + extra_var_info[param_or_buffer.name] = extra_info_dict + + # 4. build input & output spec + input_var_names = get_inout_spec(concrete_program.inputs, True) + output_vars = get_inout_spec(concrete_program.outputs) + + # 5. save inference model + with scope_guard(scope): + return fluid.io.save_inference_model( + dirname=save_dir, + feeded_var_names=input_var_names, + target_vars=output_vars, + executor=Executor(_current_expected_place()), + main_program=concrete_program.main_program.clone(), + model_filename=model_filename, + params_filename=params_filename, + program_only=model_only) else: prog = self._adapter._progs.get('test', None) diff --git a/python/paddle/incubate/hapi/tests/test_model.py b/python/paddle/incubate/hapi/tests/test_model.py index 8e0c051ee8c39c032dcc05afa466b493e1498a86..7fc471aa1e2eeb80ae81d4a32b09eeff74193e6f 100644 --- a/python/paddle/incubate/hapi/tests/test_model.py +++ b/python/paddle/incubate/hapi/tests/test_model.py @@ -22,6 +22,7 @@ import numpy as np import shutil import tempfile +import paddle from paddle import fluid from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax from paddle.fluid.dygraph.base import to_variable @@ -170,8 +171,8 @@ class TestModel(unittest.TestCase): cls.test_dataset, places=cls.device, batch_size=64) seed = 333 - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) dy_lenet = LeNetDygraph() cls.init_param = dy_lenet.state_dict() @@ -222,8 +223,8 @@ class TestModel(unittest.TestCase): def fit(self, dynamic, num_replicas=None, rank=None): fluid.enable_dygraph(self.device) if dynamic else None seed = 333 - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) net = LeNet(classifier_activation=None) optim_new = fluid.optimizer.Adam( @@ -327,8 +328,8 @@ class MyModel(fluid.dygraph.Layer): class TestModelFunction(unittest.TestCase): def set_seed(self, seed=1024): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) def test_train_batch(self, dynamic=True): dim = 20 diff --git a/python/paddle/incubate/hapi/tests/test_text.py b/python/paddle/incubate/hapi/tests/test_text.py index bdc637997b0cbd8389fdfab9f71597c62b0e21a3..c4fef0d749ce788e50d8cffdf9b7041e33d078af 100644 --- a/python/paddle/incubate/hapi/tests/test_text.py +++ b/python/paddle/incubate/hapi/tests/test_text.py @@ -20,6 +20,7 @@ import random import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import Embedding, Linear, Layer from paddle.fluid.layers 
import BeamSearchDecoder @@ -87,15 +88,18 @@ class ModuleApiTest(unittest.TestCase): fluid.enable_dygraph(place) else: fluid.disable_dygraph() - fluid.default_main_program().random_seed = self._random_seed - fluid.default_startup_program().random_seed = self._random_seed - layer = self.model_cls(**self.attrs) if isinstance( - self.attrs, dict) else self.model_cls(*self.attrs) - model = Model(layer, inputs=self.make_inputs()) - model.prepare() - if self.param_states: - model.load(self.param_states, optim_state=None) - return model.test_batch(self.inputs) + gen = paddle.manual_seed(self._random_seed) + gen._is_init_py = False + paddle.framework.random._manual_program_seed(self._random_seed) + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + layer = self.model_cls(**self.attrs) if isinstance( + self.attrs, dict) else self.model_cls(*self.attrs) + model = Model(layer, inputs=self.make_inputs()) + model.prepare() + if self.param_states: + model.load(self.param_states, optim_state=None) + return model.test_batch(self.inputs) def check_output_with_place(self, place, mode="test"): dygraph_output = self._calc_output(place, mode, dygraph=True) @@ -129,12 +133,9 @@ class TestBasicLSTM(ModuleApiTest): @staticmethod def model_init(model, input_size, hidden_size): - model.lstm = RNN( - BasicLSTMCell( - input_size, - hidden_size, - param_attr=fluid.ParamAttr(name="lstm_weight"), - bias_attr=fluid.ParamAttr(name="lstm_bias"))) + model.lstm = RNN(BasicLSTMCell( + input_size, + hidden_size, )) @staticmethod def model_forward(model, inputs): diff --git a/python/paddle/incubate/hapi/tests/test_transforms.py b/python/paddle/incubate/hapi/tests/test_transforms.py index 087f2d1615fc916d23464c1c4387b8f6befe6ac8..84208fda1e947f343de52a0a3c8de68322672013 100644 --- a/python/paddle/incubate/hapi/tests/test_transforms.py +++ b/python/paddle/incubate/hapi/tests/test_transforms.py @@ -64,6 +64,11 @@ class TestTransforms(unittest.TestCase): self.do_transform(trans) + def test_normalize(self): + normalize = transforms.Normalize(mean=0.5, std=0.5) + trans = transforms.Compose([transforms.Permute(mode='CHW'), normalize]) + self.do_transform(trans) + def test_trans_resize(self): trans = transforms.Compose([ transforms.Resize(300, [0, 1]), @@ -165,7 +170,7 @@ class TestTransforms(unittest.TestCase): fake_img = np.random.rand(500, 400, 3).astype('float32') fake_img_gray = trans_gray(fake_img) - np.testing.assert_equal(len(fake_img_gray.shape), 2) + np.testing.assert_equal(len(fake_img_gray.shape), 3) np.testing.assert_equal(fake_img_gray.shape[0], 500) np.testing.assert_equal(fake_img_gray.shape[1], 400) diff --git a/python/paddle/incubate/hapi/vision/transforms/functional.py b/python/paddle/incubate/hapi/vision/transforms/functional.py index f76aa6be8b4ddaf8b57278b32cf11d145350d772..b118ee3fc7553dc7d02028ae273be33166829635 100644 --- a/python/paddle/incubate/hapi/vision/transforms/functional.py +++ b/python/paddle/incubate/hapi/vision/transforms/functional.py @@ -16,6 +16,7 @@ import sys import collections import random import math +import functools import cv2 import numbers @@ -31,6 +32,23 @@ else: __all__ = ['flip', 'resize', 'pad', 'rotate', 'to_grayscale'] +def keepdims(func): + """Keep the dimension of input images unchanged""" + + @functools.wraps(func) + def wrapper(image, *args, **kwargs): + if len(image.shape) != 3: + raise ValueError("Expect image have 3 dims, but got {} dims".format( + len(image.shape))) + ret = func(image, *args, **kwargs) + if len(ret.shape) == 2: + ret = ret[:, :, np.newaxis] + 
return ret + + return wrapper + + +@keepdims def flip(image, code): """ Accordding to the code (the type of flip), flip the input image @@ -62,6 +80,7 @@ def flip(image, code): return cv2.flip(image, flipCode=code) +@keepdims def resize(img, size, interpolation=cv2.INTER_LINEAR): """ resize the input data to given size @@ -103,6 +122,7 @@ def resize(img, size, interpolation=cv2.INTER_LINEAR): return cv2.resize(img, size[::-1], interpolation=interpolation) +@keepdims def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'): """Pads the given CV Image on all sides with speficified padding mode and fill value. @@ -193,6 +213,7 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'): return img +@keepdims def rotate(img, angle, interpolation=cv2.INTER_LINEAR, @@ -266,6 +287,7 @@ def rotate(img, return dst.astype(dtype) +@keepdims def to_grayscale(img, num_output_channels=1): """Converts image to grayscale version of image. diff --git a/python/paddle/incubate/hapi/vision/transforms/transforms.py b/python/paddle/incubate/hapi/vision/transforms/transforms.py index 90c6e279959b2133e5cc1184b981723b34c0b750..d46faa0685aa13790be217e0c99ab407790dd2ca 100644 --- a/python/paddle/incubate/hapi/vision/transforms/transforms.py +++ b/python/paddle/incubate/hapi/vision/transforms/transforms.py @@ -505,7 +505,7 @@ class Normalize(object): mean = [mean, mean, mean] if isinstance(std, numbers.Number): - mean = [std, std, std] + std = [std, std, std] self.mean = np.array(mean, dtype=np.float32).reshape(len(mean), 1, 1) self.std = np.array(std, dtype=np.float32).reshape(len(std), 1, 1) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 76063458d44de3000ad7c1af08376c07e0209c27..5cc9f6d32f9d7ef3dafd73badd0ea88bed372968 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -97,8 +97,20 @@ from .layer.common import Dropout #DEFINE_ALIAS from .layer.common import Dropout2D #DEFINE_ALIAS from .layer.common import Dropout3D #DEFINE_ALIAS from .layer.common import AlphaDropout #DEFINE_ALIAS + +from .layer.pooling import AvgPool1d #DEFINE_ALIAS +from .layer.pooling import AvgPool2d #DEFINE_ALIAS +from .layer.pooling import AvgPool3d #DEFINE_ALIAS +from .layer.pooling import MaxPool1d #DEFINE_ALIAS +from .layer.pooling import MaxPool2d #DEFINE_ALIAS +from .layer.pooling import MaxPool3d #DEFINE_ALIAS +from .layer.pooling import AdaptiveAvgPool1d #DEFINE_ALIAS from .layer.pooling import AdaptiveAvgPool2d #DEFINE_ALIAS from .layer.pooling import AdaptiveAvgPool3d #DEFINE_ALIAS + +from .layer.pooling import AdaptiveMaxPool1d #DEFINE_ALIAS +from .layer.pooling import AdaptiveMaxPool2d #DEFINE_ALIAS +from .layer.pooling import AdaptiveMaxPool3d #DEFINE_ALIAS from .layer.conv import Conv1d #DEFINE_ALIAS from .layer.conv import Conv2d #DEFINE_ALIAS from .layer.conv import Conv3d #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 414e70853eb7163230ab2db987fc19c58e168f19..3c0aa9c5c99e545b657559c30fcde46a69781231 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -170,22 +170,28 @@ from .norm import layer_norm #DEFINE_ALIAS from .norm import lrn #DEFINE_ALIAS from .norm import normalize #DEFINE_ALIAS # from .norm import spectral_norm #DEFINE_ALIAS -from .pooling import max_pool1d #DEFINE_ALIAS -from .pooling import avg_pool1d #DEFINE_ALIAS -from .pooling import adaptive_max_pool1d #DEFINE_ALIAS -from .pooling import adaptive_avg_pool1d #DEFINE_ALIAS 
from .pooling import pool2d #DEFINE_ALIAS from .pooling import pool3d #DEFINE_ALIAS +from .pooling import avg_pool1d #DEFINE_ALIAS from .pooling import adaptive_pool2d #DEFINE_ALIAS from .pooling import adaptive_pool3d #DEFINE_ALIAS -from .rnn import rnn #DEFINE_ALIAS -from .rnn import birnn #DEFINE_ALIAS from .pooling import avg_pool2d #DEFINE_ALIAS -from .pooling import max_pool2d #DEFINE_ALIAS from .pooling import avg_pool3d #DEFINE_ALIAS +from .pooling import max_pool1d #DEFINE_ALIAS +from .pooling import max_pool2d #DEFINE_ALIAS from .pooling import max_pool3d #DEFINE_ALIAS + +from .pooling import adaptive_pool2d #DEFINE_ALIAS +from .pooling import adaptive_pool3d #DEFINE_ALIAS +from .pooling import adaptive_max_pool1d #DEFINE_ALIAS +from .pooling import adaptive_max_pool2d #DEFINE_ALIAS +from .pooling import adaptive_max_pool3d #DEFINE_ALIAS +from .pooling import adaptive_avg_pool1d #DEFINE_ALIAS from .pooling import adaptive_avg_pool2d #DEFINE_ALIAS from .pooling import adaptive_avg_pool3d #DEFINE_ALIAS + +from .rnn import rnn #DEFINE_ALIAS +from .rnn import birnn #DEFINE_ALIAS # from .rnn import gru_unit #DEFINE_ALIAS # from .rnn import lstm #DEFINE_ALIAS # from .rnn import lstm_unit #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index f80f200c7163836252faa4b1c932178f6bab0dff..42d7d98aefcbbf51f562b98c4c494aeccfe20cf2 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -158,7 +158,7 @@ def conv1d(x, bias (Tensor, optional): The bias with shape [M,]. Default: None. stride (int or tuple, optional): The stride size. If stride is a tuple, it must contain one integers, (stride_size). Default: 1. - padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. + padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms. 1. a string in ['valid', 'same']. 2. an int, which means the feature map is zero paded by size of `padding` on both sides. 3. a list[int] or tuple[int] whose length is 1, which means the feature map is zero paded by size of `padding[0]` on both sides. @@ -185,7 +185,7 @@ def conv1d(x, same with input. Raises: - ValueError: If the channel dimmention of the input is less than or equal to zero. + ValueError: If the channel dimension of the input is less than or equal to zero. ValueError: If `data_format` is not "NCL" or "NLC". ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 @@ -238,7 +238,7 @@ def conv1d(x, num_channels = x.shape[channel_dim] num_filters = weight.shape[0] if num_channels < 0: - raise ValueError("The channel dimmention of the input({}) " + raise ValueError("The channel dimension of the input({}) " "should be defined. Received: {}.".format( x.shape, num_channels)) if num_channels % groups != 0: @@ -260,7 +260,7 @@ def conv1d(x, padding = padding + [0] else: raise ValueError( - "The size of padding's dimmention should 1 or 2. But got padding={}". + "The size of padding's dimension should be 1 or 2. But got padding={}". format(padding)) stride = utils.convert_to_list(stride, 1, 'stride') + [1] @@ -350,7 +350,7 @@ def conv2d(x, For each input :math:`X`, the equation is: - .. math:: + .. math:: Out = \sigma (W \\ast X + b) @@ -377,7 +377,7 @@ def conv2d(x, Where - .. math:: + .. 
math:: H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 @@ -424,7 +424,7 @@ def conv2d(x, Raises: ValueError: If `data_format` is not "NCHW" or "NHWC". - ValueError: If the channel dimmention of the input is less than or equal to zero. + ValueError: If the channel dimension of the input is less than or equal to zero. ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 or the element corresponding to the input's channel is not 0. @@ -465,7 +465,7 @@ def conv2d(x, num_channels = x.shape[channel_dim] num_filters = weight.shape[0] if num_channels < 0: - raise ValueError("The channel dimmention of the input({}) " + raise ValueError("The channel dimension of the input({}) " "should be defined. Received: {}.".format( x.shape, num_channels)) if num_channels % groups != 0: @@ -710,7 +710,7 @@ def conv_transpose1d(x, num_channels = x.shape[channel_dim] if num_channels < 0: - raise ValueError("The channel dimmention of the input({}) " + raise ValueError("The channel dimension of the input({}) " "should be defined. Received: {}.".format( x.shape, num_channels)) if num_channels % groups != 0: @@ -728,7 +728,7 @@ def conv_transpose1d(x, padding = padding + [0] else: raise ValueError( - "The size of padding's dimmention should 1 or 2. But got padding={}". + "The size of padding's dimension should 1 or 2. But got padding={}". format(padding)) stride = utils.convert_to_list(stride, 1, 'stride') + [1] @@ -807,10 +807,10 @@ def conv_transpose2d(x, stride=1, padding=0, output_padding=0, - groups=1, dilation=1, - data_format='NCHW', + groups=1, output_size=None, + data_format='NCHW', name=None): """ @@ -829,7 +829,7 @@ def conv_transpose2d(x, For each input :math:`X`, the equation is: - .. math:: + .. math:: Out = \sigma (W \\ast X + b) @@ -856,7 +856,7 @@ def conv_transpose2d(x, Where - .. math:: + .. math:: H^\prime_{out} &= (H_{in} - 1) * strides[0] - pad_height_top - pad_height_bottom + dilations[0] * (H_f - 1) + 1 \\\\ W^\prime_{out} &= (W_{in} - 1) * strides[1] - pad_width_left - pad_width_right + dilations[1] * (W_f - 1) + 1 \\\\ @@ -883,28 +883,27 @@ def conv_transpose2d(x, stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. If stride is a tuple, it must contain two integers, (stride_height, stride_width). Otherwise, stride_height = stride_width = stride. Default: stride = 1. - padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds - `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a - string, either 'VALID' or 'SAME' supported, which is the padding algorithm. - If `padding` is a tuple or list, it could be in three forms: - `[pad_height, pad_width]` or - `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and - when `data_format` is `'NCHW'`, - `padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `'NHWC'`, `padding` can be in the form + padding(str|int|list|tuple, optional): The padding size. It means the number of zero-paddings + on both sides for each dimension. If `padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. 
If padding size is a tuple or list, + it could be in three forms: `[pad_height, pad_width]` or + `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, + and when `data_format` is `"NCHW"`, `padding` can be in the form + `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. when `data_format` is `"NHWC"`, `padding` can be in the form + `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. output_padding(int|list|tuple, optional): Additional size added to one side of each dimension in the output shape. Default: 0. - dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. - If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). - Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1. groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by grouped convolution in Alex Krizhevsky's Deep CNN paper, in which when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups = 1. + dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. + If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). + Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1. output_size(int|tuple|list, optional): The output image size. If output size is a tuple, it must contain two integers, (image_height, image_width). None if use filter_size, padding, and stride to calculate output_size. @@ -950,7 +949,7 @@ def conv_transpose2d(x, paddle.disable_static() x_var = paddle.to_tensor(x) w_var = paddle.to_tensor(w) - y_var = F.conv2d_transpose(x_var, w_var) + y_var = F.conv_transpose2d(x_var, w_var) y_np = y_var.numpy() print(y_np.shape) @@ -966,7 +965,7 @@ def conv_transpose2d(x, channel_dim = -1 if channel_last else 1 num_channels = x.shape[channel_dim] if num_channels < 0: - raise ValueError("The channel dimmention of the input({}) " + raise ValueError("The channel dimension of the input({}) " "should be defined. Received: {}.".format( x.shape, num_channels)) if num_channels % groups != 0: @@ -1070,7 +1069,7 @@ def conv3d(x, For each input :math:`X`, the equation is: - .. math:: + .. math:: Out = \sigma (W \\ast X + b) @@ -1096,7 +1095,7 @@ def conv3d(x, Where - .. math:: + .. math:: D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ @@ -1147,7 +1146,7 @@ def conv3d(x, Raises: ValueError: If `data_format` is not "NCDHW" or "NDHWC". - ValueError: If the channel dimmention of the input is less than or equal to zero. + ValueError: If the channel dimension of the input is less than or equal to zero. ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 or the element corresponding to the input's channel is not 0. @@ -1160,20 +1159,18 @@ def conv3d(x, Examples: ..
code-block:: python - from paddle import fluid - import paddle.nn.functional as F - import paddle.fluid.dygraph as dg import numpy as np + import paddle + import paddle.nn.functional as F x = np.random.randn(2, 3, 8, 8, 8).astype(np.float32) w = np.random.randn(6, 3, 3, 3, 3).astype(np.float32) - place = fluid.CPUPlace() - with dg.guard(place): - x_var = dg.to_variable(x) - w_var = dg.to_variable(w) - y_var = F.conv3d(x_var, w_var, act="relu") - y_np = y_var.numpy() + paddle.disable_static() + x_var = paddle.to_tensor(x) + w_var = paddle.to_tensor(w) + y_var = F.conv3d(x_var, w_var) + y_np = y_var.numpy() print(y_np.shape) # (2, 6, 6, 6, 6) @@ -1190,7 +1187,7 @@ def conv3d(x, num_filters = weight.shape[0] if num_channels < 0: raise ValueError( - "The channel dimmention of the input({}) should be defined. " + "The channel dimension of the input({}) should be defined. " "Received: {}.".format(x.shape, num_channels)) if num_channels % groups != 0: raise ValueError( @@ -1260,8 +1257,8 @@ def conv_transpose3d(x, output_padding=0, groups=1, dilation=1, - data_format='NCDHW', output_size=None, + output_size=None, + data_format='NCDHW', name=None): """ The convolution3d transpose layer calculates the output based on the input, @@ -1279,7 +1276,7 @@ def conv_transpose3d(x, For each input :math:`X`, the equation is: - .. math:: + .. math:: Out = \sigma (W \\ast X + b) @@ -1306,7 +1303,7 @@ def conv_transpose3d(x, Where - .. math:: + .. math:: D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\ H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\ @@ -1338,37 +1335,37 @@ def conv_transpose3d(x, If stride is a tuple, it must contain three integers, (stride_depth, stride_height, stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. - padding(int|list|str|tuple, optional): The padding size. The padding argument effectively - adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string, - either 'VALID' or 'SAME' supported, which is the padding algorithm. If `padding` - is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or + padding (string|int|list|tuple, optional): The padding size. It means the number of zero-paddings + on both sides for each dimension. If `padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If padding size is a tuple or list, + it could be in three forms: `[pad_depth, pad_height, pad_width]` or `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `'NCDHW'`, `padding` can be in the form + and when `data_format` is `"NCDHW"`, `padding` can be in the form + `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `'NDHWC'`, `padding` can be in the form + when `data_format` is `"NDHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. output_padding(int|list|tuple, optional): Additional size added to one side of each dimension in the output shape. Default: 0. - dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. 
- If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, - dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. - Default: dilation = 1. groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by grouped convolution in Alex Krizhevsky's Deep CNN paper, in which when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1 - data_format (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. + dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. + If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, + dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. + Default: dilation = 1. output_size(int|list|tuple, optional): The output image size. If output size is a tuple, it must contain three integers, (image_depth, image_height, image_width). This parameter only works when filter_size is None. If output_size and filter_size are specified at the same time, They should follow the formula above. Default: None. Output_size and filter_size should not be None at the same time. + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -1425,7 +1422,7 @@ def conv_transpose3d(x, num_filters = weight.shape[1] if num_channels < 0: raise ValueError( - "The channel dimmention of the input({}) should be defined. " + "The channel dimension of the input({}) should be defined. 
" "Received: {}.".format(x.shape, num_channels)) if num_channels % groups != 0: raise ValueError( diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 55bb36d136405385a88b991576c2a9091437d456..f1509143f3c933db12fc4ab6afd1a00b291f38f4 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -784,30 +784,30 @@ def kl_div(input, label, reduction='mean', name=None): import numpy as np import paddle.nn.functional as F - paddle.enable_imperative() + paddle.disable_static() shape = (5, 20) input = np.random.uniform(-10, 10, shape).astype('float32') target = np.random.uniform(-10, 10, shape).astype('float32') # 'batchmean' reduction, loss shape will be [N] - pred_loss = F.kl_div(paddle.to_variable(input), - paddle.to_variable(target), reduction='batchmean') + pred_loss = F.kl_div(paddle.to_tensor(input), + paddle.to_tensor(target), reduction='batchmean') # shape=[5] # 'mean' reduction, loss shape will be [1] - pred_loss = F.kl_div(paddle.to_variable(input), - paddle.to_variable(target), reduction='mean') + pred_loss = F.kl_div(paddle.to_tensor(input), + paddle.to_tensor(target), reduction='mean') # shape=[1] # 'sum' reduction, loss shape will be [1] - pred_loss = F.kl_div(paddle.to_variable(input), - paddle.to_variable(target), reduction='sum') + pred_loss = F.kl_div(paddle.to_tensor(input), + paddle.to_tensor(target), reduction='sum') # shape=[1] # 'none' reduction, loss shape is same with input shape - pred_loss = F.kl_div(paddle.to_variable(input), - paddle.to_variable(target), reduction='none') + pred_loss = F.kl_div(paddle.to_tensor(input), + paddle.to_tensor(target), reduction='none') # shape=[5, 20] """ diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index ca657b8be3e67c7acb795a0f427ca5fe2c57b1f2..c8790a75901fd5d9a38862158246e3756dc575c4 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -18,124 +18,146 @@ from ...fluid.layers import pool3d #DEFINE_ALIAS from ...fluid.layers import adaptive_pool2d #DEFINE_ALIAS from ...fluid.layers import adaptive_pool3d #DEFINE_ALIAS from ...fluid import core -from ...fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_ -from ...fluid.layers import utils, LayerHelper -from ...fluid.data_feeder import check_type, check_variable_and_dtype, check_type, check_dtype, convert_dtype -from ...fluid.layers import unsqueeze, squeeze +from ...fluid.framework import in_dygraph_mode +from ...fluid.layers import utils, LayerHelper, unsqueeze, squeeze +from ...fluid.data_feeder import check_type, check_variable_and_dtype __all__ = [ 'pool2d', 'pool3d', + 'adaptive_pool2d', + 'adaptive_pool3d', 'avg_pool1d', + 'avg_pool2d', + 'avg_pool3d', 'max_pool1d', + 'max_pool2d', + 'max_pool3d', 'adaptive_avg_pool1d', - 'adaptive_max_pool1d', 'adaptive_avg_pool2d', 'adaptive_avg_pool3d', - 'adaptive_pool2d', - 'adaptive_pool3d', - 'max_pool2d', - 'avg_pool2d', - 'max_pool3d', - 'avg_pool3d', + 'adaptive_max_pool1d', + 'adaptive_max_pool2d', + 'adaptive_max_pool3d', ] -def check_input(x, dimension): +def _is_list_or_tuple(input): + return isinstance(input, (list, tuple)) + + +def _check_input(x, dimension): if len(x.shape) != dimension: - raise ValueError("Excepted Input X is 3-D tensor, but received {}-D {}". 
- format(len(x.shape), type(x))) + raise ValueError( + "Excepted Input X is {}-D tensor, but received {}-D {}".format( + dimension, len(x.shape), type(x))) -def check_instance(x, x_name, types=(int, float)): +def _check_instance(x, x_name, types=(int, float)): if not isinstance(x, types): raise ValueError("Excepted {} type for {} but received type: {}. ". format(types, x_name, type(x))) -def update_padding1d(padding, pool_type='avg'): - def is_list_or_tuple(ele): - if isinstance(ele, list) or isinstance(ele, tuple): - return True - return False - - if is_list_or_tuple(padding): - if padding.__len__() == 1 and not is_list_or_tuple(padding[0]): - return [0, padding[0]] - else: - raise ValueError( - "{}_pool1d() argument 'padding' should contain one int (got {})". - format(pool_type, padding.__len__())) +def _zero_padding_in_batch_and_channel(padding, channel_last): + if channel_last: + return list(padding[0]) == [0, 0] and list(padding[-1]) == [0, 0] else: - padding = [0, padding] + return list(padding[0]) == [0, 0] and list(padding[1]) == [0, 0] - return padding +def _exclude_padding_in_batch_and_channel(padding, channel_last): + padding_ = padding[1:-1] if channel_last else padding[2:] + padding_ = [elem for pad_a_dim in padding_ for elem in pad_a_dim] + return padding_ -def update_padding2d(padding, data_format): - def is_list_or_tuple(ele): - if isinstance(ele, list) or isinstance(ele, tuple): - return True - return False - - if is_list_or_tuple(padding) and len(padding) == 4: - if is_list_or_tuple(padding[0]) and (data_format == "NCHW"): - if not (padding[0] == [0, 0] and padding[1] == [0, 0]): - raise ValueError( - "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) - padding = padding[2:4] - padding = [ele for a_list in padding for ele in a_list] - elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"): - if not (padding[0] == [0, 0] and padding[3] == [0, 0]): - raise ValueError( - "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) - padding = padding[1:3] - padding = [ele for a_list in padding for ele in a_list] - padding = utils.convert_to_list(padding, 4, 'padding') - - if utils._is_symmetric_padding(padding, 2): - padding = [padding[0], padding[2]] - else: - padding = utils.convert_to_list(padding, 2, 'padding') - - return padding +def _channel_last(data_format, num_dims): + if num_dims == 1: + if data_format not in ['NCL', 'NLC']: + raise ValueError( + "Attr(data_format) should be 'NCL' or 'NLC'. Received " + "Attr(data_format): %s" % str(data_format)) + else: + return True if data_format == "NLC" else False + if num_dims == 2: + if data_format not in ['NCHW', 'NHWC']: + raise ValueError( + "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " + "Attr(data_format): %s" % str(data_format)) + else: + return True if data_format == "NHWC" else False + if num_dims == 3: + if data_format not in ['NCDHW', 'NDHWC']: + raise ValueError( + "Attr(data_format) should be 'NCDHW' or 'NDHWC'. 
Received " + "Attr(data_format): %s" % str(data_format)) + else: + return True if data_format == "NDHWC" else False -def update_padding3d(padding, data_format): - def is_list_or_tuple(ele): - if isinstance(ele, (list, tuple)): - return True - return False - if is_list_or_tuple(padding) and len(padding) == 5: - if is_list_or_tuple(padding[0]) and (data_format == "NCDHW"): - if not (padding[0] == [0, 0] and padding[1] == [0, 0]): +def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False): + if isinstance(padding, str): + padding = padding.upper() + if padding not in ["SAME", "VALID"]: + raise ValueError( + "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.". + format(padding)) + if padding == "VALID": + if ceil_mode != False: raise ValueError( - "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) - padding = padding[2:5] - padding = [ele for a_list in padding for ele in a_list] - elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): - if not (padding[0] == [0, 0] and padding[4] == [0, 0]): + "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. " + "Received ceil_mode: True.") + + padding_algorithm = "VALID" + padding = [0] * num_dims + else: + padding_algorithm = "SAME" + padding = [0] * num_dims + elif _is_list_or_tuple(padding): + # for padding like + # [(pad_before, pad_after), (pad_before, pad_after), ...] + # padding for batch_dim and channel_dim included + if len(padding) == 2 + num_dims and _is_list_or_tuple(padding[0]): + if not _zero_padding_in_batch_and_channel(padding, channel_last): raise ValueError( - "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) - padding = padding[1:4] - padding = [ele for a_list in padding for ele in a_list] - padding = utils.convert_to_list(padding, 6, 'padding') - if utils._is_symmetric_padding(padding, 3): - padding = [padding[0], padding[2], padding[4]] - - elif is_list_or_tuple(padding) and len(padding) == 6: - padding = utils.convert_to_list(padding, 6, 'padding') - if utils._is_symmetric_padding(padding, 3): - padding = [padding[0], padding[2], padding[4]] + "Non-zero padding({}) in the batch or channel dimensions " + "is not supported.".format(padding)) + padding_algorithm = "EXPLICIT" + padding = _exclude_padding_in_batch_and_channel(padding, + channel_last) + if utils._is_symmetric_padding(padding, num_dims): + padding = padding[0::2] + # for padding like [pad_before, pad_after, pad_before, pad_after, ...] + elif len(padding) == 2 * num_dims and isinstance(padding[0], int): + padding_algorithm = "EXPLICIT" + padding = utils.convert_to_list(padding, 2 * num_dims, 'padding') + if utils._is_symmetric_padding(padding, num_dims): + padding = padding[0::2] + # for padding like [pad_d1, pad_d2, ...] + elif len(padding) == num_dims and isinstance(padding[0], int): + padding_algorithm = "EXPLICIT" + padding = utils.convert_to_list(padding, num_dims, 'padding') + else: + raise ValueError("Invalid padding: {}".format(padding)) + # for integer padding else: - padding = utils.convert_to_list(padding, 3, 'padding') + padding_algorithm = "EXPLICIT" + padding = utils.convert_to_list(padding, num_dims, 'padding') + return padding, padding_algorithm + +def _expand_low_nd_padding(padding): + #1d to 2d fake input + if len(padding) == 2: + padding = [0] * 2 + padding + elif len(padding) == 1: + padding = [0] + padding + else: + raise ValueError( + "The size of padding's dimmention should be 1 or 2. 
But got padding={}". + format(padding)) return padding @@ -146,73 +168,57 @@ def avg_pool1d(x, count_include_pad=True, ceil_mode=False, name=None): - """ - - This operation applies a 1D average pooling over an input signal composed - of several input planes, based on the input, output_size, return_indices parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - The output tensor shape will be [N, C, output_size]. - - The output value of the layer with input size (N, C, L), - output (N, C, L_{out}) and kernel_size k can be precisely described as - For average pool1d: - - .. math:: - - Output(N_i, C_i, l) &= mean(Input[N_i, C_i, stride \times l:stride \times l+k]) - + """ + This API implements average pooling 1d operation, + See more details in :ref:`api_nn_pooling_AvgPool1d` . Args: x (Tensor): The input tensor of pooling operator which is a 3-D tensor with shape [N, C, L]. where `N` is batch size, `C` is the number of channels, - `L` is the length of the feature. The data type if float32 or float64. + `L` is the length of the feature. The data type is float32 or float64. kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one integers. + it must contain an integer. stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain one integers. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero, - then the input is implicitly zero-padded on both sides for padding number of points. + it must contain an integer. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides. + 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is `true`. + mode, default is `True`. ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width. - If it is set to False, the floor function will be used. Default False + If it is set to False, the floor function will be used. The default value is False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ValueError: If `padding` is a list or tuple but its length greater than 1. - ShapeError: If the input is not a 3-D. + ValueError: If `padding` is a list or tuple but its length is greater than 1. + ShapeError: If the input is not a 3-D tensor. 
ShapeError: If the output's shape calculated is not greater than 0. - Examples: - .. code-block:: python - import paddle import paddle.nn.functional as F paddle.disable_static() - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - pool_out = F.avg_pool1d(data, kernel_size=2, stride=2, padding=0) - # pool_out shape: [1, 3, 16] - + out = F.avg_pool1d(data, kernel_size=2, stride=2, padding=0) + # out shape: [1, 3, 16] """ """NCL to NCHW""" data_format = "NCHW" - check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'avg_pool1d') - check_input(x, 3) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool1d') + _check_input(x, 3) x = unsqueeze(x, [2]) - kernel_size = utils.convert_to_list(kernel_size, 1, 'pool_size') + kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size') kernel_size = [1] + kernel_size if stride is None: stride = kernel_size @@ -220,33 +226,20 @@ def avg_pool1d(x, stride = utils.convert_to_list(stride, 1, 'pool_stride') stride = [1] + stride - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'." - % str(padding)) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0] - if ceil_mode != False: - raise ValueError( - "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. " - "Received ceil_mode: True.") - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0] + channel_last = _channel_last("NCL", 1) + padding, padding_algorithm = _update_padding_nd( + padding, 1, channel_last=channel_last, ceil_mode=ceil_mode) - padding = update_padding1d(padding, "avg") + # use 2d to implenment 1d should expand padding in advance. + padding = _expand_low_nd_padding(padding) if in_dygraph_mode(): output = core.ops.pool2d( x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, 'paddings', padding, 'padding_algorithm', - padding_algorithm, 'use_cudnn', not count_include_pad, 'ceil_mode', - ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format', - data_format) + padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', not count_include_pad, + 'data_format', data_format) return squeeze(output, [2]) op_type = 'pool2d' @@ -275,126 +268,103 @@ def avg_pool1d(x, return squeeze(pool_out, [2]) -def max_pool1d(x, +def avg_pool2d(x, kernel_size, stride=None, padding=0, - return_indices=False, ceil_mode=False, + count_include_pad=True, + divisor_override=None, + data_format="NCHW", name=None): """ - - Applies a 1D max pooling over an input signal composed of several input planes based - on the input, output_size, return_indices parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - - The output value of the layer with input size (N, C, L), - output (N, C, L_{out}) and kernel_size k can be precisely described as - For average pool1d: - - .. math:: - - Output(N_i, C_i, l) &= max(Input[N_i, C_i, stride \times l:stride \times l+k])} - + This API implements average pooling 2d operation. + See more details in :ref:`api_nn_pooling_AvgPool2d` . + Args: - x (Tensor): The input tensor of pooling operator which is a 3-D tensor with - shape [N, C, L], where `N` is batch size, `C` is the number of channels, - `L` is the length of the feature. The data type if float32 or float64. 
- kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one integers. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain one integers. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be the following forms: `[pad_left, pad_right]`. - return_indices (bool): Whether return the max indices along with the outputs. default is `False`. - ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default. - If it is set to False, the floor function will be used. Default False. + x (Tensor): The input tensor of pooling operator which is a 4-D tensor with + shape [N, C, H, W]. The format of input tensor is `"NCHW"` or + `"NHWC"`, where `N` is batch size, `C` is the number of channels, + `H` is the height of the feature, and `W` is the width of the + feature. The data type if float32 or float64. + kernel_size (int|list|tuple): The pool kernel size. If it is a tuple or list, + it must contain two integers, (kernel_size_Height, kernel_size_Width). + Otherwise, the pool kernel size will be a square of an int. + stride (int|list|tuple): The stride size. If it is a tuple or list, + it must contain two integers, (stride_Height, stride_Width). + Otherwise, the stride size will be a square of an int. + + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is `true`. + divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`. + The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ValueError: If `padding` is a list or tuple but its length greater than 1. - ShapeError: If the input is not a 3-D. ShapeError: If the output's shape calculated is not greater than 0. - - Examples: - .. 
code-block:: python - import paddle import paddle.nn.functional as F + import numpy as np paddle.disable_static() - - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0) - # pool_out shape: [1, 3, 16] - - pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_indices=True) - # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] - + # avg pool2d + x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + out = F.avg_pool2d(x, + kernel_size=2, + stride=2, padding=0) + # out.shape [1, 3, 16, 16] """ - """NCL to NCHW""" - data_format = "NCHW" - check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'max_pool1d') - check_input(x, 3) - x = unsqueeze(x, [2]) - kernel_size = [1] + utils.convert_to_list(kernel_size, 1, 'pool_size') + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool2d') + kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size') if stride is None: stride = kernel_size else: - stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride') - - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'." - % str(padding)) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0] - if ceil_mode != False: - raise ValueError( - "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. " - "Received ceil_mode: True.") - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0] + stride = utils.convert_to_list(stride, 2, 'pool_stride') - padding = update_padding1d(padding, 'max') + channel_last = _channel_last(data_format, 2) + padding, padding_algorithm = _update_padding_nd( + padding, 2, channel_last, ceil_mode=ceil_mode) if in_dygraph_mode(): - pool_out = core.ops.max_pool2d_with_index( - x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, - 'paddings', padding, 'padding_algorithm', padding_algorithm, - 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, - 'exclusive', True, 'data_format', data_format) - return (squeeze(pool_out[0], [2]), squeeze( - pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2]) + output = core.ops.pool2d( + x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', + False, 'padding_algorithm', padding_algorithm, 'strides', stride, + 'paddings', padding, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', not count_include_pad, + 'data_format', data_format) + if divisor_override is None: + return output + else: + _check_instance(divisor_override, "divisor_override") + return output * (kernel_size[0] * kernel_size[1]) / divisor_override - op_type = 'max_pool2d_with_index' + op_type = 'pool2d' helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) - mask = helper.create_variable_for_type_inference(dtype) - outputs = {"Out": pool_out, "Mask": mask} helper.append_op( type=op_type, inputs={"X": x}, - outputs=outputs, + outputs={"Out": pool_out}, attrs={ - "pooling_type": 'max', + "pooling_type": "avg", "ksize": kernel_size, "global_pooling": False, "strides": stride, @@ -403,335 +373,211 @@ def max_pool1d(x, "use_cudnn": True, "ceil_mode": ceil_mode, "use_mkldnn": False, - "exclusive": True, + "exclusive": not count_include_pad, "data_format": data_format, 
}) - return (squeeze(pool_out, [2]), - squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2]) - - -def adaptive_avg_pool1d(x, output_size, name=None): - """ - - This operation applies a 1D adaptive average pooling over an input signal composed - of several input planes, based on the input, output_size, return_indices parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - The output tensor shape will be [N, C, output_size]. - - For average adaptive pool1d: - - .. math:: - - lstart &= floor(i * L_{in} / L_{out}) - - lend &= ceil((i + 1) * L_{in} / L_{out}) - - Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)} - - Args: - x (Tensor): The input tensor of pooling operator, which is a 3-D tensor - with shape [N, C, L]. The format of input tensor is NCL, - where N is batch size, C is the number of channels, L is the - length of the feature. The data type is float32 or float64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Tensor: The output tensor of adaptive average pooling result. The data type is same - as input tensor. - - Raises: - ValueError: 'output_size' should be a integer or list or tuple with length as 1. - - Examples: - .. code-block:: python - - # average adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) - # - import paddle - import paddle.nn.functional as F - paddle.disable_static() - - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - pool_out = F.adaptive_average_pool1d(data, output_size=16) - # pool_out shape: [1, 3, 16]) - """ - pool_type = 'avg' - check_variable_and_dtype(x, 'input', ['float32', 'float64'], - 'adaptive_pool2d') - check_input(x, 3) - check_type(output_size, 'pool_size', (int), 'adaptive_pool1d') - - pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') - - l_type = "pool2d" - x = unsqueeze(x, [2]) - if in_dygraph_mode(): - pool_out = core.ops.pool2d(x, 'pooling_type', pool_type, 'ksize', - pool_size, 'adaptive', True) - return squeeze(pool_out, [2]) - - helper = LayerHelper(l_type, **locals()) - dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - - outputs = {"Out": pool_out} - helper.append_op( - type=l_type, - inputs={"X": x}, - outputs=outputs, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "adaptive": True, - }) - - return squeeze(pool_out, [2]) + if divisor_override is None: + return pool_out + else: + _check_instance(divisor_override, "divisor_override") + return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override -def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): +def avg_pool3d(x, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + count_include_pad=False, + divisor_override=None, + data_format="NCDHW", + name=None): """ - This operation applies a 1D adaptive 
max pooling over an input signal composed - of several input planes, based on the input, output_size, return_indices parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - The output tensor shape will be [N, C, output_size]. - - For max adaptive pool1d: - - .. math:: - - lstart &= floor(i * L_{in} / L_{out}) - - lend &= ceil((i + 1) * L_{in} / L_{out}) - - Output(i) &= max(Input[lstart:lend])} + This API implements average pooling 3d operation. + See more details in :ref:`api_nn_pooling_AvgPool3d` . Args: - x (Tensor): The input tensor of pooling operator, which is a 3-D tensor - with shape [N, C, L]. The format of input tensor is NCL, - where N is batch size, C is the number of channels, L is the - length of the feature. The data type is float32 or float64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. - return_indices (bool): If true, the index of max pooling point will be returned along - with outputs. It cannot be set in average pooling type. Default False. + x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with + shape [N, C, D, H, W], where `N` represents the batch size, `C` represents + the number of channels, `D`, `H` and `W` represent the depth, height and width of the feature respectively. + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size + is a tuple or list, it must contain three integers, + (kernel_size_Depth, kernel_size_Height, kernel_size_Width). + Otherwise, the pool kernel size will be the cube of an int. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain three integers, [stride_Depth, stride_Height, stride_Width). + Otherwise, the pool stride size will be a cube of an int. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + ceil_mode (bool): ${ceil_mode_comment} + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is True. + divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. + data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: - Tensor: The output tensor of adaptive pooling result. 
The data type is same - as input tensor. - + Tensor: The output tensor of pooling result. The data type is same as input tensor. Raises: - ValueError: 'output_size' should be a integer or list or tuple with length as 1. - + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the output's shape calculated is not greater than 0. Examples: .. code-block:: python - - # max adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = max(input[:, :, lstart: lend]) - # - import paddle - import paddle.nn.functional as F - paddle.disable_static() - - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - pool_out = F.adaptive_max_pool1d(data, output_size=16) - # pool_out shape: [1, 3, 16]) - - pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_indices=True) - # pool_out shape: [1, 3, 16] indices shape: [1, 3, 16] - + import paddle.fluid as fluid + import paddle + x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) + # avg pool3d + out = paddle.nn.functional.avg_pool3d( + x, + kernel_size = 2, + stride = 2, + padding=0) + # out.shape: [1, 3, 16, 16, 16] """ - pool_type = 'max' - check_variable_and_dtype(x, 'input', ['float32', 'float64'], - 'adaptive_max_pool1d') - check_input(x, 3) - check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d') - check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d') - - pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d') + kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') + if stride is None: + stride = kernel_size + else: + stride = utils.convert_to_list(stride, 3, 'pool_stride') - l_type = 'max_pool2d_with_index' + channel_last = _channel_last(data_format, 3) + padding, padding_algorithm = _update_padding_nd( + padding, 3, channel_last=channel_last, ceil_mode=ceil_mode) - x = unsqueeze(x, [2]) if in_dygraph_mode(): - pool_out = core.ops.max_pool2d_with_index( - x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True) - return (squeeze(pool_out[0], [2]), squeeze( - pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2]) + output = core.ops.pool3d( + x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride, + 'paddings', padding, 'global_pooling', False, 'padding_algorithm', + padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', not count_include_pad, + 'data_format', data_format) + if divisor_override is None: + return output + else: + _check_instance(divisor_override, "divisor_override") + return output * (kernel_size[0] * kernel_size[1] * + kernel_size[2]) / divisor_override - helper = LayerHelper(l_type, **locals()) + op_type = "pool3d" + helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - - mask = helper.create_variable_for_type_inference(dtype) - outputs = {"Out": pool_out, "Mask": mask} + pool_out = helper.create_variable_for_type_inference(dtype) + 
outputs = {"Out": pool_out} helper.append_op( - type=l_type, + type=op_type, inputs={"X": x}, outputs=outputs, attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "adaptive": True, + "pooling_type": 'avg', + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": not count_include_pad, + "data_format": data_format, }) - return (squeeze(pool_out, [2]), - squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2]) + if divisor_override is None: + return pool_out + else: + _check_instance(divisor_override, "divisor_override") + return pool_out * (kernel_size[0] * kernel_size[1] * + kernel_size[2]) / divisor_override -def max_pool2d(x, +def max_pool1d(x, kernel_size, stride=None, padding=0, return_indices=False, ceil_mode=False, - data_format="NCHW", name=None): """ - This operation applies 2D max pooling over input feature based on the input, - and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCHW format, where N is batch size, C is the number of channels, - H is the height of the feature, and W is the width of the feature. - - Example: - Input: - X shape: $(N, C, H_{in}, W_{in})$ - Attr: - kernel_size: ksize - stride: stride - - Output: - Out shape: $(N, C, H_{out}, W_{out})$ - $$ - out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\ - & \text{input}(N_i, C_j, \text{stride[0]} \times h + m, - \text{stride[1]} \times w + n) - $$ + This API implements max pooling 1d opereation. + See more details in :ref:`api_nn_pooling_MaxPool1d` . Args: - x (Tensor): The input tensor of pooling operator which is a 4-D tensor with - shape [N, C, H, W]. The format of input tensor is `"NCHW"` or - `"NHWC"`, where `N` is batch size, `C` is the number of channels, - `H` is the height of the feature, and `W` is the width of the - feature. The data type if float32 or float64. + x (Tensor): The input tensor of pooling operator which is a 3-D tensor with + shape [N, C, L], where `N` is batch size, `C` is the number of channels, + `L` is the length of the feature. The data type if float32 or float64. kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain two integers, (pool_size_Height, pool_size_Width). - Otherwise, the pool kernel size will be a square of an int. + it must contain an integer. stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain two integers, (pool_stride_Height, pool_stride_Width). - Otherwise, the pool stride size will be a square of an int. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be in three forms: `[pad_height, pad_width]` or - `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, - `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - Otherwise, the pool padding size will be a square of an int. 
- ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - return_indices (bool): Whether to return the max indices along with the outputs. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. + it must contain an integer. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An integer, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides. + 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + return_indices (bool): Whether return the max indices along with the outputs. default is `False`. + ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default. + If it is set to False, the floor function will be used. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. + Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the input is not a 3-D tensor. ShapeError: If the output's shape calculated is not greater than 0. + Examples: .. code-block:: python import paddle import paddle.nn.functional as F - import numpy as np paddle.disable_static() - - # max pool2d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - output = F.max_pool2d(input, - kernel_size=2, - stride=2, padding=0) - # output.shape [1, 3, 16, 16] - - # for return_indices=True - output, max_indices = F.max_pool2d(input, - kernel_size=2, - stride=2, - padding=0, - return_indices=True) - # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0) + # pool_out shape: [1, 3, 16] + pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_indices=True) + # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] """ - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool2d') - kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size') + """NCL to NCHW""" + data_format = "NCHW" + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool1d') + _check_input(x, 3) + x = unsqueeze(x, [2]) + kernel_size = [1] + utils.convert_to_list(kernel_size, 1, 'pool_size') if stride is None: stride = kernel_size else: - stride = utils.convert_to_list(stride, 2, 'pool_stride') + stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride') - if data_format not in ["NCHW", "NHWC"]: - raise ValueError( - "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " - "Attr(data_format): %s." 
% str(data_format)) - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'." - % str(padding)) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0, 0] - if ceil_mode != False: - raise ValueError( - "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. " - "Received ceil_mode: True.") - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0, 0] + padding, padding_algorithm = _update_padding_nd( + padding, 1, ceil_mode=ceil_mode) - padding = update_padding2d(padding, data_format) + # use 2d to implenment 1d should expand padding in advance. + padding = _expand_low_nd_padding(padding) if in_dygraph_mode(): - output = core.ops.max_pool2d_with_index( + pool_out = core.ops.max_pool2d_with_index( x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, 'paddings', padding, 'padding_algorithm', padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format', data_format) - return output if return_indices else output[0] + return (squeeze(pool_out[0], [2]), squeeze( + pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2]) op_type = 'max_pool2d_with_index' helper = LayerHelper(op_type, **locals()) @@ -758,36 +604,21 @@ def max_pool2d(x, "data_format": data_format, }) - return (pool_out, mask) if return_indices else pool_out + return (squeeze(pool_out, [2]), + squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2]) -def avg_pool2d(x, +def max_pool2d(x, kernel_size, stride=None, padding=0, + return_indices=False, ceil_mode=False, - count_include_pad=True, - divisor_override=None, data_format="NCHW", name=None): """ - This operation applies 2D average pooling over input features based on the input, - and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCHW format, where N is batch size, C is the number of channels, - H is the height of the feature, and W is the width of the feature. - - Example: - Input: - X shape: $(N, C, H_{in}, W_{in})$ - Attr: - kernel_size: ksize - - Output: - Out shape: $(N, C, H_{out}, W_{out})$ - $$ - out(N_i, C_j, h, w) = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1} - input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) - $$ + This API implements max pooling 2d operation. + See more details in :ref:`api_nn_pooling_MaxPool2d` . Args: x (Tensor): The input tensor of pooling operator which is a 4-D tensor with @@ -796,30 +627,26 @@ def avg_pool2d(x, `H` is the height of the feature, and `W` is the width of the feature. The data type if float32 or float64. kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain two integers, (pool_size_Height, pool_size_Width). + it must contain two integers, (kernel_size_Height, kernel_size_Width). Otherwise, the pool kernel size will be a square of an int. stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain two integers, (pool_stride_Height, pool_stride_Width). + it must contain two integers, (stride_Height, stride_Width). Otherwise, the pool stride size will be a square of an int. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. 
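The new `max_pool1d` above is routed through the 2-D kernel: the NCL input gains a dummy height axis, the kernel and stride are prefixed with 1, and the result is squeezed back to 3-D. A NumPy sketch of the shape bookkeeping (a reference only, assuming non-overlapping windows and a length divisible by the kernel size):

.. code-block:: python

    import numpy as np

    n, c, l, k = 1, 3, 32, 2                 # batch, channels, length, kernel size
    x = np.random.uniform(-1, 1, (n, c, l)).astype(np.float32)

    x4d = x[:, :, np.newaxis, :]             # NCL -> NCHW with H == 1, as unsqueeze(x, [2]) does

    # reference max pool with stride == kernel (windows of size k along L)
    out4d = x4d.reshape(n, c, 1, l // k, k).max(axis=-1)
    out = out4d[:, :, 0, :]                  # squeeze the dummy height axis back

    print(out.shape)                         # (1, 3, 16)
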
If pool padding size is a tuple or list, - it could be in three forms: `[pad_height, pad_width]` or - `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, - `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - Otherwise, the pool padding size will be a square of an int. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is `true`. - divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. + return_indices (bool): Whether to return the max indices along with the outputs. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. Raises: @@ -832,87 +659,71 @@ def avg_pool2d(x, import paddle.nn.functional as F import numpy as np paddle.disable_static() - - # avg pool2d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - output = F.avg_pool2d(input, + # max pool2d + x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + out = F.max_pool2d(x, kernel_size=2, stride=2, padding=0) # output.shape [1, 3, 16, 16] - + # for return_indices=True + out, max_indices = F.max_pool2d(x, + kernel_size=2, + stride=2, + padding=0, + return_indices=True) + # out.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], """ - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool2d') + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool2d') kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size') if stride is None: stride = kernel_size else: stride = utils.convert_to_list(stride, 2, 'pool_stride') - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." 
- % str(padding)) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0, 0] - if ceil_mode != False: - raise ValueError( - "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. " - "Received ceil_mode: True.") - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0, 0] - if data_format not in ["NCHW", "NHWC"]: raise ValueError( "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " "Attr(data_format): %s." % str(data_format)) - pool_padding = update_padding2d(padding, data_format) + + channel_last = True if data_format == "NHWC" else False + + padding, padding_algorithm = _update_padding_nd( + padding, num_dims=2, channel_last=channel_last, ceil_mode=ceil_mode) if in_dygraph_mode(): - output = core.ops.pool2d( - x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', - False, 'padding_algorithm', padding_algorithm, 'strides', stride, - 'paddings', pool_padding, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', not count_include_pad, - 'data_format', data_format) - if divisor_override is None: - return output - else: - check_instance(divisor_override, "divisor_override") - return output * (kernel_size[0] * kernel_size[1]) / divisor_override + output = core.ops.max_pool2d_with_index( + x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, + 'paddings', padding, 'padding_algorithm', padding_algorithm, + 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, + 'exclusive', True, 'data_format', data_format) + return output if return_indices else output[0] - op_type = 'pool2d' + op_type = 'max_pool2d_with_index' helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) + mask = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out, "Mask": mask} helper.append_op( type=op_type, inputs={"X": x}, - outputs={"Out": pool_out}, + outputs=outputs, attrs={ - "pooling_type": "avg", + "pooling_type": 'max', "ksize": kernel_size, "global_pooling": False, "strides": stride, - "paddings": pool_padding, + "paddings": padding, "padding_algorithm": padding_algorithm, "use_cudnn": True, "ceil_mode": ceil_mode, "use_mkldnn": False, - "exclusive": not count_include_pad, + "exclusive": True, "data_format": data_format, }) - if divisor_override is None: - return pool_out - else: - check_instance(divisor_override, "divisor_override") - return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override + return (pool_out, mask) if return_indices else pool_out def max_pool3d(x, @@ -924,47 +735,25 @@ def max_pool3d(x, data_format="NCDHW", name=None): """ - This operation applies 3D max pooling over input features based on the input, - and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCDHW format, where N is batch size, C is the number of channels, - H is the height of the feature, D is the depth of the feature, and W is the width of the feature. - - Example: - Input: - X shape: $(N, C, D_{in}, H_{in}, W_{in})$ - Attr: - kernel_size: ksize - - Output: - Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ - $$ - \text{out}(N_i, C_j, d, h, w) ={} & \max_{k=0, \ldots, ksize[0]-1} \max_{m=0, \ldots, ksize[1]-1} \max_{n=0, \ldots, ksize[2]-1} \\ - & \text{input}(N_i, C_j, \text{stride[0]} \times d + k, - \text{stride[1]} \times h + m, \text{stride[2]} \times w + n) - $$ - + This API implements max pooling 2d operation. + See more details in :ref:`api_nn_pooling_MaxPool3d` . 
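The padding forms listed in these pooling docstrings are normalized internally (by `_update_padding_nd` in this file). The following is only a simplified illustration of the documented 2-D forms, not the library's own normalization logic:

.. code-block:: python

    def describe_padding_2d(padding, data_format="NCHW"):
        """Illustrative mapping of the documented 2-D padding forms (not the real helper)."""
        if isinstance(padding, str):                      # form 1: 'valid' / 'same'
            return padding.upper(), [0, 0, 0, 0]
        if isinstance(padding, int):                      # form 2: one size for every side
            return "EXPLICIT", [padding] * 4
        padding = list(padding)
        if len(padding) == 2 and all(isinstance(p, int) for p in padding):
            ph, pw = padding                              # form 3: [pad_height, pad_width]
            return "EXPLICIT", [ph, ph, pw, pw]
        if len(padding) == 4 and all(isinstance(p, int) for p in padding):
            return "EXPLICIT", padding                    # form 4: [top, bottom, left, right]
        # form 5: one [before, after] pair per dimension; for NCHW the first two
        # pairs (batch, channel) are expected to be [0, 0]
        spatial = padding[2:] if data_format == "NCHW" else padding[1:3]
        return "EXPLICIT", [p for pair in spatial for p in pair]

    print(describe_padding_2d("same"))        # ('SAME', [0, 0, 0, 0])
    print(describe_padding_2d([1, 2]))        # ('EXPLICIT', [1, 1, 2, 2])
    print(describe_padding_2d([[0, 0], [0, 0], [1, 1], [2, 2]]))  # ('EXPLICIT', [1, 1, 2, 2])
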
Args: x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with - shape [N, C, D, H, W]. The format of - input tensor is `"NCDHW"` or `"NDHWC"`, where `N` is batch size, `C` is - the number of channels, `D` is the depth of the feature, - `H` is the height of the feature, and `W` is the width - of the feature. - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size + shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` or `"NDHWC"`, where N represents batch size, C represents the number of channels, D, H and W represent the depth, height and width of the feature respectively. + kernel_size (int|list|tuple): The pool kernel size. If the kernel size is a tuple or list, it must contain three integers, - (pool_size_Depth, pool_size_Height, pool_size_Width). + (kernel_size_Depth, kernel_size_Height, kernel_size_Width). Otherwise, the pool kernel size will be the cube of an int. - stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool stride size is a tuple or list, - it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain three integers, [stride_Depth, stride_Height, stride_Width). Otherwise, the pool stride size will be a cube of an int. - padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list, - it could be in three forms: `[pad_depth, pad_height, pad_width]` or - `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form - `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. ceil_mode (bool): ${ceil_mode_comment} return_indices (bool): Whether to return the max indices along with the outputs. data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. @@ -973,7 +762,6 @@ def max_pool3d(x, name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. 
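A short usage sketch for the 3-D entry point itself, with and without `return_indices` (input shape assumed, mirroring the example data used elsewhere in this file):

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    x = paddle.to_tensor(
        np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))

    out = F.max_pool3d(x, kernel_size=2, stride=2, padding=0)
    # out.shape: [1, 3, 16, 16, 16]

    out, indices = F.max_pool3d(
        x, kernel_size=2, stride=2, padding=0, return_indices=True)
    # out.shape: [1, 3, 16, 16, 16], indices.shape: [1, 3, 16, 16, 16]
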
Raises: @@ -986,23 +774,20 @@ def max_pool3d(x, import paddle.nn.functional as F import numpy as np paddle.disable_static() - # max pool3d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) - output = F.max_pool2d(input, + x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) + output = F.max_pool2d(x, kernel_size=2, stride=2, padding=0) output.shape [1, 3, 16, 16, 16] - # for return_indices=True - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) - output, max_indices = paddle.nn.functional.max_pool3d(input, + x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) + output, max_indices = paddle.nn.functional.max_pool3d(x, kernel_size = 2, stride = 2, padding=0, return_indices=True) # output.shape [None, 3, 16, 16, 16], max_indices.shape [None, 3, 16, 16, 16], - """ check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d') kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') @@ -1011,29 +796,10 @@ def max_pool3d(x, else: stride = utils.convert_to_list(stride, 3, 'pool_stride') - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." - % str(padding)) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0, 0, 0] - if ceil_mode != False: - raise ValueError( - "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. " - "Received ceil_mode: True.") - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0, 0, 0] + channel_last = _channel_last(data_format, 3) - if data_format not in ["NCDHW", "NDHWC"]: - raise ValueError( - "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " - "Attr(data_format): %s" % str(data_format)) - padding = update_padding3d(padding, data_format) + padding, padding_algorithm = _update_padding_nd( + padding, 3, channel_last=channel_last, ceil_mode=ceil_mode) if in_dygraph_mode(): output = core.ops.max_pool3d_with_index( @@ -1071,170 +837,83 @@ def max_pool3d(x, return (pool_out, mask) if return_indices else pool_out -def avg_pool3d(x, - kernel_size, - stride=None, - padding=0, - ceil_mode=False, - count_include_pad=False, - divisor_override=None, - data_format="NCDHW", - name=None): +def adaptive_avg_pool1d(x, output_size, name=None): """ - This operation applies 3D max pooling over input features based on the input, - and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCDHW format, where N is batch size, C is the number of channels, - H is the height of the feature, D is the depth of the feature, and W is the width of the feature. - + This API implements adaptive average pooling 1d operation. + See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` . + Args: - input (Tensor): The input tensor of pooling operator, which is a 5-D tensor with - shape [N, C, D, H, W], where `N` is batch size, `C` is - the number of channels, `D` is the depth of the feature, - `H` is the height of the feature, and `W` is the width - of the feature. - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size - is a tuple or list, it must contain three integers, - (pool_size_Depth, pool_size_Height, pool_size_Width). - Otherwise, the pool kernel size will be the cube of an int. - stride (string|int|list|tuple)): The pool padding. 
If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool stride size is a tuple or list, - it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`. - Otherwise, the pool stride size will be a cube of an int. - padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list, - it could be in three forms: `[pad_depth, pad_height, pad_width]` or - `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form - `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - ceil_mode (bool): ${ceil_mode_comment} - count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is True. - divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. - data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. - The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_depth, input_height, input_width]`. + x (Tensor): The input tensor of pooling operator, which is a 3-D tensor + with shape [N, C, L]. The format of input tensor is NCL, + where N is batch size, C is the number of channels, L is the + length of the feature. The data type is float32 or float64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one int. name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: - Tensor: The output tensor of pooling result. The data type is same as input tensor. + Tensor: The output tensor of adaptive average pooling result. The data type is same + as input tensor. Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. + ValueError: 'output_size' should be an integer or list or tuple with length as 1. Examples: .. code-block:: python - import paddle.fluid as fluid - import paddle - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) - # avg pool3d - pool3d = paddle.nn.functional.avg_pool3d( - input, - kernel_size = 2, - stride = 2, - padding=0) - # pool3d.shape: [1, 3, 16, 16, 16] - """ - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d') - kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') - if stride is None: - stride = kernel_size - else: - stride = utils.convert_to_list(stride, 3, 'pool_stride') - - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." 
- % str(padding)) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0, 0, 0] - if ceil_mode != False: - raise ValueError( - "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. " - "Received ceil_mode: True.") - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0, 0, 0] + # average adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. + # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) + # + import paddle + import paddle.nn.functional as F + paddle.disable_static() + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + pool_out = F.adaptive_average_pool1d(data, output_size=16) + # pool_out shape: [1, 3, 16]) + """ + pool_type = 'avg' + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'adaptive_pool2d') + _check_input(x, 3) + check_type(output_size, 'pool_size', (int), 'adaptive_pool1d') - if data_format not in ["NCDHW", "NDHWC"]: - raise ValueError( - "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " - "Attr(data_format): %s" % str(data_format)) - padding = update_padding3d(padding, data_format) + pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') + l_type = "pool2d" + x = unsqueeze(x, [2]) if in_dygraph_mode(): - output = core.ops.pool3d( - x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride, - 'paddings', padding, 'global_pooling', False, 'padding_algorithm', - padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', not count_include_pad, - 'data_format', data_format) - if divisor_override is None: - return output - else: - check_instance(divisor_override, "divisor_override") - return output * (kernel_size[0] * kernel_size[1] * - kernel_size[2]) / divisor_override + pool_out = core.ops.pool2d(x, 'pooling_type', pool_type, 'ksize', + pool_size, 'adaptive', True) + return squeeze(pool_out, [2]) - op_type = "pool3d" - helper = LayerHelper(op_type, **locals()) + helper = LayerHelper(l_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) - outputs = {"Out": pool_out} + outputs = {"Out": pool_out} helper.append_op( - type=op_type, + type=l_type, inputs={"X": x}, outputs=outputs, attrs={ - "pooling_type": 'avg', - "ksize": kernel_size, - "global_pooling": False, - "strides": stride, - "paddings": padding, - "padding_algorithm": padding_algorithm, - "use_cudnn": True, - "ceil_mode": ceil_mode, - "use_mkldnn": False, - "exclusive": not count_include_pad, - "data_format": data_format, + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, }) - if divisor_override is None: - return pool_out - else: - check_instance(divisor_override, "divisor_override") - return pool_out * (kernel_size[0] * kernel_size[1] * - kernel_size[2]) / divisor_override + return squeeze(pool_out, [2]) def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): """ - - This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions - of the output tensor are determined by the parameter output_size. - See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool2d` . - - For avg adaptive pool2d: - - .. 
math:: - - hstart &= floor(i * H_{in} / H_{out}) - - hend &= ceil((i + 1) * H_{in} / H_{out}) - - wstart &= floor(j * W_{in} / W_{out}) - - wend &= ceil((j + 1) * W_{in} / W_{out}) - - Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} + This API implements adaptive average pooling 2d operation. + See more details in :ref:`api_nn_pooling_AdaptiveAvgPool2d` . Args: x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. @@ -1248,16 +927,12 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor. - Raises: ValueError: If `data_format` is not "NCHW" or "NHWC". - Examples: .. code-block:: python - # adaptive avg pool2d # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], # output shape is [N, C, m, n], adaptive pool divide H and W dimensions @@ -1279,10 +954,10 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): input_data = np.random.rand(2, 3, 32, 32) x = paddle.to_tensor(input_data) # x.shape is [2, 3, 32, 32] - pool_out = paddle.nn.functional.adaptive_avg_pool2d( + out = paddle.nn.functional.adaptive_avg_pool2d( x = x, output_size=[3, 3]) - # pool_out.shape is [2, 3, 3, 3] + # out.shape is [2, 3, 3, 3] """ if not in_dygraph_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], @@ -1337,28 +1012,8 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): """ - - This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions - of the output tensor are determined by the parameter output_size. - See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool3d` . - - For avg adaptive pool3d: - - .. math:: - - dstart &= floor(i * D_{in} / D_{out}) - - dend &= ceil((i + 1) * D_{in} / D_{out}) - - hstart &= floor(j * H_{in} / H_{out}) - - hend &= ceil((j + 1) * H_{in} / H_{out}) - - wstart &= floor(k * W_{in} / W_{out}) - - wend &= ceil((k + 1) * W_{in} / W_{out}) - - Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + This API implements adaptive average pooling 3d operation. + See more details in :ref:`api_nn_pooling_AdaptiveAvgPool3d` . Args: x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. @@ -1372,16 +1027,12 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor. - Raises: ValueError: If `data_format` is not "NCDHW" or "NDHWC". - Examples: .. 
code-block:: python - # adaptive avg pool3d # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions @@ -1406,10 +1057,10 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): input_data = np.random.rand(2, 3, 8, 32, 32) x = paddle.to_tensor(input_data) # x.shape is [2, 3, 8, 32, 32] - pool_out = paddle.nn.functional.adaptive_avg_pool3d( + out = paddle.nn.functional.adaptive_avg_pool3d( x = x, output_size=[3, 3, 3]) - # pool_out.shape is [2, 3, 3, 3, 3] + # out.shape is [2, 3, 3, 3, 3] """ if not in_dygraph_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], @@ -1461,3 +1112,257 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): }) return pool_out + + +def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): + """ + This API implements adaptive max pooling 1d operation. + See more details in :ref:`api_nn_pooling_AdaptiveMaxPool1d` . + + Args: + x (Tensor): The input tensor of pooling operator, which is a 3-D tensor + with shape [N, C, L]. The format of input tensor is NCL, + where N is batch size, C is the number of channels, L is the + length of the feature. The data type is float32 or float64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one int. + return_indices (bool): If true, the index of max pooling point will be returned along + with outputs. It cannot be set in average pooling type. Default False. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Returns: + Tensor: The output tensor of adaptive pooling result. The data type is same + as input tensor. + Raises: + ValueError: 'output_size' should be a integer or list or tuple with length as 1. + Examples: + .. code-block:: python + # max adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. 
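The adaptive 1-D pools divide the length axis using the floor/ceil grid formulas documented above (lstart = floor(i * L_in / L_out), lend = ceil((i + 1) * L_in / L_out)). A small NumPy reference of that arithmetic, covering both the average and the max variants (a sketch only, not the operator):

.. code-block:: python

    import math
    import numpy as np

    def adaptive_pool1d_ref(x, output_size, mode="max"):
        """Reference adaptive 1-D pooling over the last axis of a [N, C, L] array."""
        n, c, l_in = x.shape
        out = np.empty((n, c, output_size), dtype=x.dtype)
        for i in range(output_size):
            lstart = math.floor(i * l_in / output_size)
            lend = math.ceil((i + 1) * l_in / output_size)
            window = x[:, :, lstart:lend]
            out[:, :, i] = window.max(axis=-1) if mode == "max" else window.mean(axis=-1)
        return out

    data = np.random.uniform(-1, 1, (1, 3, 32)).astype(np.float32)
    print(adaptive_pool1d_ref(data, 16).shape)   # (1, 3, 16)
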
+ # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = max(input[:, :, lstart: lend]) + # + import paddle + import paddle.nn.functional as F + paddle.disable_static() + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + pool_out = F.adaptive_max_pool1d(data, output_size=16) + # pool_out shape: [1, 3, 16]) + pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_indices=True) + # pool_out shape: [1, 3, 16] indices shape: [1, 3, 16] + """ + pool_type = 'max' + check_variable_and_dtype(x, 'x', ['float32', 'float64'], + 'adaptive_max_pool1d') + _check_input(x, 3) + check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d') + check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d') + + pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') + + l_type = 'max_pool2d_with_index' + + x = unsqueeze(x, [2]) + if in_dygraph_mode(): + pool_out = core.ops.max_pool2d_with_index( + x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True) + return (squeeze(pool_out[0], [2]), squeeze( + pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2]) + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + mask = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out, "Mask": mask} + + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, + }) + + return (squeeze(pool_out, [2]), + squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2]) + + +def adaptive_max_pool2d(x, output_size, return_indices=False, name=None): + """ + This operation applies a 2D adaptive max pooling on input tensor. + See more details in :ref:`api_nn_pooling_AdaptiveMaxPool2d` . + Args: + x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float16, float32, float64, int32 or int64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two elements, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. + return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Returns: + Tensor: The output tensor of adaptive max pool2d result. The data type is same as input tensor. + Examples: + .. code-block:: python + # max adaptive pool2d + # suppose input data in the shape of [N, C, H, W], `output_size` is [m, n] + # output shape is [N, C, m, n], adaptive pool divide H and W dimensions + # of input data into m*n grids averagely and performs poolings in each + # grid to get output. 
+ # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # for j in range(n): + # hstart = floor(i * H / m) + # hend = ceil((i + 1) * H / m) + # wstart = floor(i * W / n) + # wend = ceil((i + 1) * W / n) + # output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend]) + # + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(2, 3, 32, 32) + x = paddle.to_tensor(input_data) + # x.shape is [2, 3, 32, 32] + out = paddle.nn.functional.adaptive_max_pool2d( + x = x, + output_size=[3, 3]) + # out.shape is [2, 3, 3, 3] + """ + if not in_dygraph_mode(): + check_variable_and_dtype(x, 'x', ['float32', 'float64'], + 'adaptive_max_pool2d') + _check_input(x, 4) + #check_type(output_size, 'pool_size', (int), 'adaptive_max_pool2d') + check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool2d') + + in_h, in_w = x.shape[2:4] + if isinstance(output_size, int): + output_size = utils.convert_to_list(output_size, 2, 'output_size') + else: + if output_size[0] == None: + output_size[0] = in_h + if output_size[1] == None: + output_size[1] = in_w + + if in_dygraph_mode(): + pool_out = core.ops.max_pool2d_with_index( + x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True) + return pool_out if return_indices else pool_out[0] + + l_type = 'max_pool2d_with_index' + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + mask = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out, "Mask": mask} + + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'max', + "ksize": output_size, + "adaptive": True, + }) + #return (pool_out, mask) if return_indices else pool_out + return pool_out + + +def adaptive_max_pool3d(x, output_size, return_indices=False, name=None): + """ + This operation applies a 3D adaptive max pooling on input tensor. + See more details in :ref:`api_nn_pooling_AdaptiveMaxPool3d` . + Args: + x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input. + return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Returns: + Tensor: The output tensor of adaptive max pool3d result. The data type is same as input tensor. + Examples: + .. code-block:: python + # adaptive max pool3d + # suppose input data in the shape of [N, C, D, H, W], `output_size` is [l, m, n] + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions + # of input data into m*n grids averagely and performs poolings in each + # grid to get output. 
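As shown in the `adaptive_max_pool2d` implementation above, a `None` entry in `output_size` is resolved to the matching input dimension. A hedged usage sketch of that behaviour (input sizes assumed):

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    x = paddle.to_tensor(np.random.rand(2, 3, 32, 32).astype(np.float32))

    # None keeps that dimension at its input size (here H stays 32, W becomes 3)
    out = paddle.nn.functional.adaptive_max_pool2d(x, output_size=[None, 3])
    # out.shape: [2, 3, 32, 3]
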
+ # adaptive max pool performs calculations as follow: + # + # for i in range(l): + # for j in range(m): + # for k in range(n): + # dstart = floor(i * D / l) + # dend = ceil((i + 1) * D / l) + # hstart = floor(i * H / m) + # hend = ceil((i + 1) * H / m) + # wstart = floor(i * W / n) + # wend = ceil((i + 1) * W / n) + # output[:, :, i, j, k] = max(input[:, :, dstart: dend, hstart: hend, wstart: wend]) + # + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(2, 3, 8, 32, 32) + x = paddle.to_tensor(input_data) + # x.shape is [2, 3, 8, 32, 32] + out = paddle.nn.functional.adaptive_max_pool3d( + x = x, + output_size=[3, 3, 3]) + # out.shape is [2, 3, 3, 3, 3] + """ + + if not in_dygraph_mode(): + check_variable_and_dtype(x, 'x', ['float32', 'float64'], + 'adaptive_max_pool3d') + _check_input(x, 5) + #check_type(output_size, 'pool_size', (int), 'adaptive_max_pool3d') + check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool3d') + + in_l, in_h, in_w = x.shape[2:5] + if isinstance(output_size, int): + output_size = utils.convert_to_list(output_size, 3, 'output_size') + else: + if output_size[0] == None: + output_size[0] = in_l + if output_size[1] == None: + output_size[1] = in_h + if output_size[2] == None: + output_size[2] = in_w + + if in_dygraph_mode(): + pool_out = core.ops.max_pool3d_with_index( + x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True) + return pool_out if return_indices else pool_out[0] + + l_type = 'max_pool3d_with_index' + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + mask = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": pool_out, "Mask": mask} + + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'max', + "ksize": output_size, + "adaptive": True, + }) + + return (pool_out, mask) if return_indices else pool_out diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 3399e4e34c9e3bc61fde515fc1917deb213f3d0b..6eac15cd694e51c24f94f7686b6e63fa7c6cbf09 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -66,16 +66,18 @@ from .common import Dropout #DEFINE_ALIAS from .common import Dropout2D #DEFINE_ALIAS from .common import Dropout3D #DEFINE_ALIAS from .common import AlphaDropout #DEFINE_ALIAS -from .pooling import AdaptiveAvgPool2d #DEFINE_ALIAS -from .pooling import AdaptiveAvgPool3d #DEFINE_ALIAS from .pooling import AvgPool1d #DEFINE_ALIAS -from .pooling import MaxPool1d #DEFINE_ALIAS -from .pooling import AdaptiveAvgPool1d #DEFINE_ALIAS -from .pooling import AdaptiveMaxPool1d #DEFINE_ALIAS from .pooling import AvgPool2d #DEFINE_ALIAS -from .pooling import MaxPool2d #DEFINE_ALIAS from .pooling import AvgPool3d #DEFINE_ALIAS +from .pooling import MaxPool1d #DEFINE_ALIAS +from .pooling import MaxPool2d #DEFINE_ALIAS from .pooling import MaxPool3d #DEFINE_ALIAS +from .pooling import AdaptiveAvgPool1d #DEFINE_ALIAS +from .pooling import AdaptiveAvgPool2d #DEFINE_ALIAS +from .pooling import AdaptiveAvgPool3d #DEFINE_ALIAS +from .pooling import AdaptiveMaxPool1d #DEFINE_ALIAS +from .pooling import AdaptiveMaxPool2d #DEFINE_ALIAS +from .pooling import AdaptiveMaxPool3d #DEFINE_ALIAS from .conv import Conv1d #DEFINE_ALIAS from .conv import Conv2d #DEFINE_ALIAS from .conv import Conv3d #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 
7d0e59fb7575c9d15d28e88a462aed4ddba47fb9..4e342c00528a2c0115940bb7f695e1ed5b582382 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -99,7 +99,8 @@ class _ConvNd(layers.Layer): raise ValueError("in_channels must be divisible by groups.") if padding_mode in {'reflect', 'replicate', 'circular'}: - _paired_padding = utils.convert_to_list(padding, 2, 'padding') + _paired_padding = utils.convert_to_list(padding, dims, + 'padding') self._reversed_padding_repeated_twice = _reverse_repeat_list( _paired_padding, 2) @@ -318,62 +319,80 @@ class Conv2d(_ConvNd): output of the convolution, and the corresponding activation function is applied to the final result. For each input :math:`X`, the equation is: - .. math:: - Out = \\sigma (W \\ast X + b) + + .. math:: + + Out = \sigma (W \\ast X + b) + Where: + * :math:`X`: Input value, a ``Tensor`` with NCHW format. * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + Parameters: - in_channels(int): The number of channels in the input image. - out_channels(int): The number of channels produced by convolution. - kernel_size (int|list|tuple): The size of convolution kernel. - stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. Default: 1. + in_channels(int): The number of input channels in the input image. + out_channels(int): The number of output channels produced by the convolution. + kernel_size(int|list|tuple, optional): The size of the convolving kernel. + stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must + contain three integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. The default value is 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding`on both sides + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). The default value is 0. - padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` . - dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must - contain two integers, (dilation_H, dilation_W). Otherwise, the - dilation_H = dilation_W = dilation. Default: 1. - groups (int, optional): The groups number of the Conv2d Layer. 
According to grouped + dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the + dilation_D = dilation_H = dilation_W = dilation. The default value is 1. + groups(int, optional): The groups number of the Conv3d Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only - connected to the second half of the input channels. Default: 1. - weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) + connected to the second half of the input channels. The default value is 1. + padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``. + weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights of conv2d. If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with :math:`Normal(0.0, std)`, - and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. - bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d. + will create ParamAttr as param_attr. If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv2d will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - data_format (str, optional): Data format that specifies the layout of input. + is not set, the bias is initialized zero. The default value is None. + data_format(str, optional): Data format that specifies the layout of input. It can be "NCHW" or "NHWC". Default: "NCHW". + Attribute: + **weight** (Parameter): the learnable weights of filter of this layer. + **bias** (Parameter or None): the learnable bias of this layer. + Shape: + - x: :math:`(N, C_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, H_{out}, W_{out})` + Where - .. math:: - H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel_size[0] - 1) + 1))}{strides[0]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel_size[1] - 1) + 1))}{strides[1]} + 1 + + .. math:: + + H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 + + W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 + Examples: + .. code-block:: python + import numpy as np import paddle import paddle.nn as nn @@ -646,35 +665,29 @@ class ConvTranspose2d(_ConvNd): The details of convolution transpose layer, please refer to the following explanation and references `conv2dtranspose `_ . For each input :math:`X`, the equation is: - .. math:: + + .. math:: + Out = \sigma (W \\ast X + b) + Where: + * :math:`X`: Input value, a ``Tensor`` with NCHW format. * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. 
* :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - Example: - - Input: - Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` - Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)` - - Output: - Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` - Where - .. math:: - H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\ - W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\ - H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ - W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) + Parameters: in_channels(int): The number of channels in the input image. out_channels(int): The number of channels produced by the convolution. kernel_size(int|list|uple): The kernel size. If kernel_size is a tuple, it must contain two integers, (kernel_size_H, kernel_size_W). Otherwise, the kernel will be a square. - output_padding(int|list|tuple, optional): Additional size added to one side - of each dimension in the output shape. Default: 0. + stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` on both sides @@ -682,9 +695,8 @@ class ConvTranspose2d(_ConvNd): 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). The default value is 0. - stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. Default: 1. + output_padding(int|list|tuple, optional): Additional size added to one side + of each dimension in the output shape. Default: 0. dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must contain two integers, (dilation_H, dilation_W). Otherwise, the dilation_H = dilation_W = dilation. Default: 1. @@ -694,29 +706,46 @@ class ConvTranspose2d(_ConvNd): first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: 1. - weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) + weight_attr(ParamAttr, optional): The parameter attribute for learnable weights(Parameter) of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d_transpose. + bias_attr(ParamAttr|bool, optional): The attribute for the bias of conv2d_transpose. If it is set to False, no bias will be added to the output units. 
If it is set to None or one attribute of ParamAttr, conv2d_transpose will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - data_format (str, optional): Data format that specifies the layout of input. + data_format(str, optional): Data format that specifies the layout of input. It can be "NCHW" or "NHWC". Default: "NCHW". + Attribute: + **weight** (Parameter): the learnable weights of filters of this layer. + **bias** (Parameter or None): the learnable bias of this layer. + Shape: + - x: :math:`(N, C_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, H_{out}, W_{out})` + Where - .. math:: - H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\ - W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\ + + .. math:: + + H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel\_size[0] - 1) + 1 + + W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel\_size[1] - 1) + 1 + + H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) + + W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) + Examples: + .. code-block:: python + import numpy as np import paddle import paddle.nn as nn @@ -791,66 +820,86 @@ class Conv3d(_ConvNd): provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. For each input :math:`X`, the equation is: - .. math:: + + .. math:: + Out = \sigma (W \\ast X + b) + In the above equation: + * :math:`X`: Input value, a tensor with NCDHW or NDHWC format. * :math:`W`: Filter value, a tensor with MCDHW format. * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + Parameters: in_channels(int): The number of input channels in the input image. out_channels(int): The number of output channels produced by the convolution. - kernel_size (int|list|tuple, optional): The size of the convolving kernel. - stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must + kernel_size(int|list|tuple, optional): The size of the convolving kernel. + stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must contain three integers, (stride_D, stride_H, stride_W). Otherwise, the stride_D = stride_H = stride_W = stride. The default value is 1. - padding (int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. + padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. 
Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). The default value is 0. - dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must + dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the dilation_D = dilation_H = dilation_W = dilation. The default value is 1. - groups (int, optional): The groups number of the Conv3d Layer. According to grouped + groups(int, optional): The groups number of the Conv3d Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. The default value is 1. - padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``. - weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights + padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``. + weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights of conv3d. If it is set to None or one attribute of ParamAttr, conv3d will create ParamAttr as param_attr. If it is set to None, the parameter is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. - bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv3d. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv3d will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. The default value is None. - data_format (str, optional): Data format that specifies the layout of input. + data_format(str, optional): Data format that specifies the layout of input. It can be "NCDHW" or "NDHWC". Default: "NCDHW". + Attribute: + **weight** (Parameter): the learnable weights of filters of this layer. + **bias** (Parameter): the learnable bias of this layer. + Shape: + - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + Where - .. math:: - D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ - H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 + + .. math:: + + D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 + + H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 + + W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (kernel\_size[2] - 1) + 1))}{strides[2]} + 1 + Raises: ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. + Examples: + .. 
code-block:: python + import numpy as np import paddle @@ -936,17 +985,22 @@ class ConvTranspose3d(_ConvNd): the output of the convolution, and the corresponding activation function is applied to the final result. For each input :math:`X`, the equation is: - .. math:: + + .. math:: + Out = \sigma (W \\ast X + b) + In the above equation: + * :math:`X`: Input value, a tensor with NCDHW format. * :math:`W`: Filter value, a tensor with MCDHW format. * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - Example: + **Note**: + The conv_transpose3d can be seen as the backward of the conv3d. For conv3d, when stride > 1, conv3d maps multiple input shape to the same output shape, so for conv_transpose3d, when stride > 1, input shape maps multiple output shape. @@ -957,6 +1011,7 @@ class ConvTranspose3d(_ConvNd): and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, conv_transpose3d can compute the kernel size automatically. + Parameters: in_channels(int): The number of channels in the input image. out_channels(int): The number of channels produced by the convolution. @@ -985,11 +1040,11 @@ class ConvTranspose3d(_ConvNd): first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. The default value is 1. - weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights + weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. The default value is None. - bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv3d_transpose will create ParamAttr as bias_attr. If the Initializer of the bias_attr @@ -999,24 +1054,38 @@ class ConvTranspose3d(_ConvNd): filter_size, padding, and stride to calculate output_size. if output_size and filter_size are specified at the same time, They should follow the formula above. Default: None. - data_format (str, optional): Data format that specifies the layout of input. + data_format(str, optional): Data format that specifies the layout of input. It can be "NCDHW" or "NDHWC". Default: "NCDHW". + Attribute: + **weight** (Parameter): the learnable weights of filters of this layer. + **bias** (Parameter): the learnable bias of this layer. + Shape: + - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + Where - .. math:: - D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\ - H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\ - W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel_size[2] - 1) + 1 \\\\ + + .. 
math:: + + D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel\_size[0] - 1) + 1 + + H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel\_size[1] - 1) + 1 + + W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel\_size[2] - 1) + 1 + Raises: ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. Examples: + .. code-block:: python + import numpy as np import paddle import paddle.nn as nn @@ -1024,7 +1093,7 @@ class ConvTranspose3d(_ConvNd): paddle.disable_static() x_var = paddle.to_tensor(x) - conv = nn.Conv3DTranspose(4, 6, (3, 3, 3)) + conv = nn.ConvTranspose3d(4, 6, (3, 3, 3)) y_var = conv(x_var) y_np = y_var.numpy() print(y_np.shape) diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index de10e77eb1c000e66a7a914dc94ce39a6268bb61..a1c7d28a85e762ebb381c5f0075df1c7b00396f7 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -634,9 +634,12 @@ class KLDivLoss(fluid.dygraph.Layer): Default is ``'mean'``. Shape: - - input: (N, *) where * means, any number of additional dimensions. - - label: (N, *), same shape as input - - output: tensor with shape: (1) by default. + + - input (Tensor): (N, *), where * means, any number of additional dimensions. + + - label (Tensor): (N, *), same shape as input. + + - output (Tensor): tensor with shape: [1] by default. Examples: @@ -646,7 +649,7 @@ class KLDivLoss(fluid.dygraph.Layer): import numpy as np import paddle.nn as nn - paddle.enable_imperative() + paddle.disable_static() shape = (5, 20) x = np.random.uniform(-10, 10, shape).astype('float32') @@ -654,26 +657,26 @@ class KLDivLoss(fluid.dygraph.Layer): # 'batchmean' reduction, loss shape will be [N] kldiv_criterion = nn.KLDivLoss(reduction='batchmean') - pred_loss = kldiv_criterion(paddle.to_variable(x), - paddle.to_variable(target)) + pred_loss = kldiv_criterion(paddle.to_tensor(x), + paddle.to_tensor(target)) # shape=[5] # 'mean' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='mean') - pred_loss = kldiv_criterion(paddle.to_variable(x), - paddle.to_variable(target)) + pred_loss = kldiv_criterion(paddle.to_tensor(x), + paddle.to_tensor(target)) # shape=[1] # 'sum' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='sum') - pred_loss = kldiv_criterion(paddle.to_variable(x), - paddle.to_variable(target)) + pred_loss = kldiv_criterion(paddle.to_tensor(x), + paddle.to_tensor(target)) # shape=[1] # 'none' reduction, loss shape is same with X shape kldiv_criterion = nn.KLDivLoss(reduction='none') - pred_loss = kldiv_criterion(paddle.to_variable(x), - paddle.to_variable(target)) + pred_loss = kldiv_criterion(paddle.to_tensor(x), + paddle.to_tensor(target)) # shape=[5, 20] """ diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index c7855b23bf6e6861326533e3cc93d7f7c5bd4ca2..4d25418579d74ae896f8ca590400a0a334047e93 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -27,6 +27,7 @@ # TODO: define normalization api +import six from ...fluid.dygraph.nn import InstanceNorm from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS @@ -36,7 +37,6 @@ from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS from ...fluid.dygraph import layers - from ...framework import get_default_dtype, set_default_dtype from ...fluid.framework import in_dygraph_mode @@ -50,6 +50,7 @@ from 
..functional import batch_norm, layer_norm, instance_norm import numpy as np import numbers import warnings +from ...fluid.dygraph.base import no_grad __all__ = [ 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', @@ -566,17 +567,28 @@ class _BatchNormBase(layers.Layer): param_shape = [num_features] # create parameter - self.weight = self.create_parameter( - attr=self._weight_attr, - shape=param_shape, - default_initializer=Constant(1.0)) - self.weight.stop_gradient = (self._weight_attr is False) or ( - self._weight_attr and self._weight_attr.learning_rate == 0.) + if weight_attr == False: + self.weight = self.create_parameter( + attr=None, shape=param_shape, default_initializer=Constant(1.0)) + self.weight.stop_gradient = True + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. - self.bias = self.create_parameter( - attr=self._bias_attr, shape=param_shape, is_bias=True) - self.bias.stop_gradient = (self._bias_attr is False) or ( - self._bias_attr and self._bias_attr.learning_rate == 0.) + if bias_attr == False: + self.bias = self.create_parameter( + attr=None, + shape=param_shape, + default_initializer=Constant(0.0), + is_bias=True) + self.bias.stop_gradient = True + else: + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0. moving_mean_name = None moving_variance_name = None @@ -611,6 +623,7 @@ class _BatchNormBase(layers.Layer): self._epsilon = epsilon self._fuse_with_relu = False self._track_running_stats = track_running_stats + self._name = name def _check_input_dim(self, input): raise NotImplementedError("BatchNorm Base error") @@ -898,7 +911,7 @@ class BatchNorm3d(_BatchNormBase): len(input.shape))) -class SyncBatchNorm(layers.Layer): +class SyncBatchNorm(_BatchNormBase): """ This interface is used to construct a callable object of the ``SyncBatchNorm`` class. It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can @@ -984,72 +997,16 @@ class SyncBatchNorm(layers.Layer): def __init__(self, num_features, - epsilon=1e-05, momentum=0.9, - track_running_stats=True, + epsilon=1e-05, weight_attr=None, bias_attr=None, data_format='NCHW', + track_running_stats=True, name=None): - super(SyncBatchNorm, self).__init__() - self._weight_attr = weight_attr - self._bias_attr = bias_attr - self._num_features = num_features - self._data_layout = data_format - self._momentum = momentum - self._epsilon = epsilon - self._track_running_stats = track_running_stats - - if self._track_running_stats == False: - warnings.warn( - "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version." - ) - - param_shape = [self._num_features] - - # create parameter - if weight_attr == False: - self.weight = self.create_parameter( - attr=None, shape=param_shape, default_initializer=Constant(1.0)) - self.weight.stop_gradient = True - else: - self.weight = self.create_parameter( - attr=self._weight_attr, - shape=param_shape, - default_initializer=Constant(1.0)) - self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. 
- - if bias_attr == False: - self.bias = self.create_parameter( - attr=None, - shape=param_shape, - default_initializer=Constant(0.0), - is_bias=True) - self.bias.stop_gradient = True - else: - self.bias = self.create_parameter( - attr=self._bias_attr, shape=param_shape, is_bias=True) - self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. - - self._mean = self.create_parameter( - attr=ParamAttr( - name=None, - initializer=Constant(0.0), - trainable=False, - do_model_average=True), - shape=param_shape, - dtype=self._dtype) - self._mean.stop_gradient = True - - self._variance = self.create_parameter( - attr=ParamAttr( - name=None, - initializer=Constant(1.0), - trainable=False, - do_model_average=True), - shape=param_shape, - dtype=self._dtype) - self._variance.stop_gradient = True + super(SyncBatchNorm, + self).__init__(num_features, momentum, epsilon, weight_attr, + bias_attr, data_format, track_running_stats, name) def forward(self, x): # create output @@ -1063,7 +1020,7 @@ class SyncBatchNorm(layers.Layer): if in_dygraph_mode(): attrs = ("momentum", self._momentum, "epsilon", self._epsilon, "is_test", not self.training, "data_layout", - self._data_layout, "use_mkldnn", False, "fuse_with_relu", + self._data_format, "use_mkldnn", False, "fuse_with_relu", False, "use_global_stats", False, 'trainable_statistics', False) sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm( @@ -1073,13 +1030,13 @@ class SyncBatchNorm(layers.Layer): return sync_batch_norm_out check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], - 'BatchNorm') + 'SyncBatchNorm') attrs = { "momentum": self._momentum, "epsilon": self._epsilon, "is_test": not self.training, - "data_layout": self._data_layout, + "data_layout": self._data_format, "use_mkldnn": False, "fuse_with_relu": False, "use_global_stats": False, @@ -1112,3 +1069,45 @@ class SyncBatchNorm(layers.Layer): self._helper.append_op( type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) return sync_batch_norm_out + + @classmethod + def convert_sync_batchnorm(cls, layer): + """ + Helper function to convert :class: `paddle.nn.BatchNorm*d` layers in the model to :class: `paddle.nn.SyncBatchNorm` layers. + + Parameters: + layer(paddle.nn.Layer): model containing one or more `BatchNorm*d` layers. + + Returns: + The original model with converted SyncBatchNorm layers. If BatchNorm*d layer in the model, use SyncBatchNorm layer instead. + + Examples: + + .. 
code-block:: python + import paddle + import paddle.nn as nn + + paddle.disable_static() + model = nn.Sequential(nn.Conv2d(3, 5, 3), nn.BatchNorm2d(5)) + sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + + """ + layer_output = layer + if isinstance(layer, _BatchNormBase): + layer_output = SyncBatchNorm(layer._num_features, layer._epsilon, + layer._momentum, layer._weight_attr, + layer._bias_attr, layer._data_format, + layer._name) + + if layer._weight_attr != False and layer._bias_attr != False: + with no_grad(): + layer_output.weight = layer.weight + layer_output.bias = layer.bias + layer_output._mean = layer._mean + layer_output._variance = layer._variance + + for name, sublayer in layer.named_sublayers(): + layer_output.add_sublayer(name, + cls.convert_sync_batchnorm(sublayer)) + del layer + return layer_output diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 87fa0caec9ee287c42d8308d9da25c6d2fc9b911..6f6b567849732ff889db4507708758cd8eeab2a8 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -12,198 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle - -from ...fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype -from ...fluid.layers import utils from ...fluid.dygraph import layers from ...fluid.layer_helper import LayerHelper from .. import functional as F __all__ = [ - 'AdaptiveAvgPool2d', - 'AdaptiveAvgPool3d', 'AvgPool1d', - 'maxPool1d', - 'AdaptiveMaxPool1d', - 'AdaptiveAvgPool1d', 'AvgPool2d', - 'MaxPool2d', 'AvgPool3d', + 'MaxPool1d', + 'MaxPool2d', 'MaxPool3d', + 'AdaptiveAvgPool1d', + 'AdaptiveAvgPool2d', + 'AdaptiveAvgPool3d', + 'AdaptiveMaxPool1d', + 'AdaptiveMaxPool2d', + 'AdaptiveMaxPool3d', ] -class AdaptiveAvgPool2d(layers.Layer): - """ - - This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions - of the output tensor are determined by the parameter output_size. - - For avg adaptive pool2d: - - .. math:: - - hstart &= floor(i * H_{in} / H_{out}) - - hend &= ceil((i + 1) * H_{in} / H_{out}) - - wstart &= floor(j * W_{in} / W_{out}) - - wend &= ceil((j + 1) * W_{in} / W_{out}) - - Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} - - - Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain two element, (H, W). H and W can be either a int, or None which means - the size will be the same as that of the input. - data_format (str): The data format of the input and output data. An optional string - from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in - the order of: [batch_size, input_channels, input_height, input_width]. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Shape: - x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type can be float32 or float64. - output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type is same as input x. - - Returns: - A callable object of AdaptiveAvgPool2d. - - Examples: - .. 
code-block:: python - - # adaptive avg pool2d - # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], - # output shape is [N, C, m, n], adaptive pool divide H and W dimensions - # of input data into m * n grids averagely and performs poolings in each - # grid to get output. - # adaptive avg pool performs calculations as follow: - # - # for i in range(m): - # for j in range(n): - # hstart = floor(i * H / m) - # hend = ceil((i + 1) * H / m) - # wstart = floor(i * W / n) - # wend = ceil((i + 1) * W / n) - # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) - # - import paddle - import numpy as np - paddle.disable_static() - input_data = np.random.rand(2, 3, 32, 32) - x = paddle.to_tensor(input_data) - # x.shape is [2, 3, 32, 32] - adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=3) - pool_out = adaptive_avg_pool(x = x) - # pool_out.shape is [2, 3, 3, 3] - """ - - def __init__(self, output_size, data_format="NCHW", name=None): - super(AdaptiveAvgPool2d, self).__init__() - self._output_size = output_size - self._data_format = data_format - self._name = name - - def forward(self, x): - return F.adaptive_avg_pool2d( - x, - output_size=self._output_size, - data_format=self._data_format, - name=self._name) - - -class AdaptiveAvgPool3d(layers.Layer): - """ - - This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions - of the output tensor are determined by the parameter output_size. - - For avg adaptive pool3d: - - .. math:: - - dstart &= floor(i * D_{in} / D_{out}) - - dend &= ceil((i + 1) * D_{in} / D_{out}) - - hstart &= floor(j * H_{in} / H_{out}) - - hend &= ceil((j + 1) * H_{in} / H_{out}) - - wstart &= floor(k * W_{in} / W_{out}) - - wend &= ceil((k + 1) * W_{in} / W_{out}) - - Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} - - - Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means - the size will be the same as that of the input. - data_format (str): The data format of the input and output data. An optional string - from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in - the order of: [batch_size, input_channels, input_depth, input_height, input_width]. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - Shape: - x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type can be float32 or float64. - output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x. - - Returns: - A callable object of AdaptiveAvgPool3d. - - Examples: - .. code-block:: python - - # adaptive avg pool3d - # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], - # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions - # of input data into l * m * n grids averagely and performs poolings in each - # grid to get output. 
- # adaptive avg pool performs calculations as follow: - # - # for i in range(l): - # for j in range(m): - # for k in range(n): - # dstart = floor(i * D / l) - # dend = ceil((i + 1) * D / l) - # hstart = floor(j * H / m) - # hend = ceil((j + 1) * H / m) - # wstart = floor(k * W / n) - # wend = ceil((k + 1) * W / n) - # output[:, :, i, j, k] = - # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) - import paddle - import numpy as np - paddle.disable_static() - input_data = np.random.rand(2, 3, 8, 32, 32) - x = paddle.to_tensor(input_data) - # x.shape is [2, 3, 8, 32, 32] - adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=3) - pool_out = adaptive_avg_pool(x = x) - # pool_out = [2, 3, 3, 3, 3] - """ - - def __init__(self, output_size, data_format="NCDHW", name=None): - super(AdaptiveAvgPool3d, self).__init__() - self._output_size = output_size - self._data_format = data_format - self._name = name - - def forward(self, x): - return F.adaptive_avg_pool3d( - x, - output_size=self._output_size, - data_format=self._data_format, - name=self._name) - - class AvgPool1d(layers.Layer): """ This operation applies a 1D average pooling over an input signal composed @@ -223,17 +51,20 @@ class AvgPool1d(layers.Layer): Args: kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one integers. + it must contain an integer. stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain one integers. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero, - then the input is implicitly zero-padded on both sides for padding number of points. + it must contain an integer. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides. + 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is `true`. + mode, default is `True`. ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width. - If it is set to False, the floor function will be used. Default False + If it is set to False, the floor function will be used. The default value is False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -245,10 +76,14 @@ class AvgPool1d(layers.Layer): ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. ValueError: If `padding` is a list or tuple but its length greater than 1. - ShapeError: If the input is not a 3-D. + ShapeError: If the input is not a 3-D tensor. ShapeError: If the output's shape calculated is not greater than 0. 
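+        A minimal sketch of how the padding forms listed above affect the output length
+        (editorial illustration only; the shapes assume the standard pooling formula
+        L_out = (L_in + 2 * padding - kernel_size) / stride + 1 with the defaults
+        ``ceil_mode=False`` and ``count_include_pad=True``):
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            import paddle.nn as nn
+
+            paddle.disable_static()
+            data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+
+            # an int pads both ends of the length dimension: (32 + 2*1 - 2)/2 + 1 = 17
+            pool_int = nn.AvgPool1d(kernel_size=2, stride=2, padding=1)
+            out_int = pool_int(data)    # shape: [1, 3, 17]
+
+            # a one-element list behaves the same as the int form
+            pool_list = nn.AvgPool1d(kernel_size=2, stride=2, padding=[1])
+            out_list = pool_list(data)  # shape: [1, 3, 17]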
+ Shape: + - inpuut: 3-D tensor. + - output: 3-D tensor + Examples: .. code-block:: python @@ -284,63 +119,74 @@ class AvgPool1d(layers.Layer): return out -class MaxPool1d(layers.Layer): +class AvgPool2d(layers.Layer): """ - Applies a 1D max pooling over an input signal composed of several input planes based - on the input, output_size, return_indices parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - - The output value of the layer with input size (N, C, L), - output (N, C, L_{out}) and kernel_size k can be precisely described as - For average pool1d: + This operation applies 2D average pooling over input features based on the input, + and kernel_size, stride, padding parameters. Input(X) and Output(Out) are + in NCHW format, where N is batch size, C is the number of channels, + H is the height of the feature, and W is the width of the feature. - .. math:: + Example: + Input: + X shape: $(N, C, H_{in}, W_{in})$ + Attr: + kernel_size: ksize - Output(N_i, C_i, l) &= max(Input[N_i, C_i, stride \times l:stride \times l+k])} + Output: + Out shape: $(N, C, H_{out}, W_{out})$ + $$ + out(N_i, C_j, h, w) = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1} + input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) + $$ Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one integers. + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two integers, (pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be a square of an int. stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain one integers. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be the following forms: `[pad_left, pad_right]`. - return_indices (bool): Whether return the max indices along with the outputs. default is `False`. - ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default. - If it is set to False, the floor function will be used. Default False + it must contain two integers, (pool_stride_Height, pool_stride_Width). + Otherwise, the pool stride size will be a square of an int. + + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is `true`. 
+ divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. + The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: - None. + Shape: + - x: 4-D tensor. + - out: 2-D tensor + Returns: None. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ValueError: If `padding` is a list or tuple but its length greater than 1. - ShapeError: If the input is not a 3-D. ShapeError: If the output's shape calculated is not greater than 0. - - Examples: - .. code-block:: python - import paddle import paddle.nn as nn + import numpy as np paddle.disable_static() - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0) - pool_out = MaxPool1d(data) - # pool_out shape: [1, 3, 16] - - MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0, return_indices=True) - pool_out, indices = MaxPool1d(data) - # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + # max pool2d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + AvgPool2d = nn.AvgPool2d(kernel_size=2, + stride=2, padding=0) + output = AvgPoo2d(input) + # output.shape [1, 3, 16, 16] """ @@ -348,113 +194,155 @@ class MaxPool1d(layers.Layer): kernel_size, stride=None, padding=0, - return_indices=False, ceil_mode=False, + count_include_pad=True, + divisor_override=None, + data_format="NCHW", name=None): - super(MaxPool1d, self).__init__() - self.kernel_size = kernel_size + super(AvgPool2d, self).__init__() + self.ksize = kernel_size self.stride = stride self.padding = padding self.ceil_mode = ceil_mode - self.return_indices = return_indices + self.count_include_pad = count_include_pad + self.divisor = divisor_override + self.data_format = data_format self.name = name - def forward(self, input): - out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding, - self.return_indices, self.ceil_mode, self.name) - return out + def forward(self, x): + return F.avg_pool2d( + x, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + ceil_mode=self.ceil_mode, + count_include_pad=self.count_include_pad, + divisor_override=self.divisor, + data_format=self.data_format, + name=self.name) -class AdaptiveAvgPool1d(layers.Layer): +class AvgPool3d(layers.Layer): """ - - This operation applies a 1D adaptive average pooling over an input signal composed - of several input planes, based on the input, output_size, return_indices parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - The output tensor shape will be [N, C, output_size]. - - For average adaptive pool1d: - - .. math:: - - lstart &= floor(i * L_{in} / L_{out}) - - lend &= ceil((i + 1) * L_{in} / L_{out}) - - Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)} + This operation applies 3D max pooling over input features based on the input, + and kernel_size, stride, padding parameters. 
Input(X) and Output(Out) are + in NCDHW format, where N is batch size, C is the number of channels, + H is the height of the feature, D is the depth of the feature, and W is the width of the feature. Args: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size + is a tuple or list, it must contain three integers, + (kernel_size_Depth, kernel_size_Height, kernel_size_Width). + Otherwise, the pool kernel size will be the cube of an int. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain three integers, [stride_Depth, stride_Height, stride_Width). + Otherwise, the pool stride size will be a cube of an int. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + ceil_mode (bool): ${ceil_mode_comment} + count_include_pad (bool): Whether to exclude padding points in average pooling + mode, default is True. + divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. + data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: - None. - + Returns: None. Raises: - ValueError: 'pool_size' should be a integer or list or tuple with length as 1. + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ShapeError: If the output's shape calculated is not greater than 0. + + Shape: + - x: 5-D tensor. + - out: 5-D tensor. Examples: .. code-block:: python - - # average adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. 
- # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) - # import paddle import paddle.nn as nn + import numpy as np paddle.disable_static() - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - AdaptiveAvgPool1d = nn.AdaptiveAvgPool1d(output_size=16) - pool_out = AdaptiveAvgPool1d(data) - # pool_out shape: [1, 3, 16] + # avg pool3d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) + AvgPool3d = nn.AvgPool3d(kernel_size=2, + stride=2, padding=0) + output = AvgPool3d(input) + # output.shape [1, 2, 3, 16, 16] + """ - def __init__(self, output_size, name=None): - super(AdaptiveAvgPool1d, self).__init__() - self.output_size = output_size + def __init__(self, + kernel_size, + stride, + padding=0, + ceil_mode=False, + count_include_pad=True, + divisor_override=None, + data_format="NCDHW", + name=None): + super(AvgPool3d, self).__init__() + self.ksize = kernel_size + self.stride = stride + self.padding = padding + self.ceil_mode = ceil_mode + self.count_include_pad = count_include_pad + self.divisor = divisor_override + self.data_format = data_format self.name = name - def forward(self, input): - return F.adaptive_avg_pool1d(input, self.output_size, self.name) + def forward(self, x): + return F.avg_pool3d( + x, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + ceil_mode=self.ceil_mode, + count_include_pad=self.count_include_pad, + divisor_override=self.divisor, + data_format=self.data_format, + name=self.name) -class AdaptiveMaxPool1d(layers.Layer): +class MaxPool1d(layers.Layer): """ - - This operation applies a 1D adaptive max pooling over an input signal composed - of several input planes, based on the input, output_size, return_indices parameters. + Applies a 1D max pooling over an input signal composed of several input planes based + on the input, output_size, return_indices parameters. Input(X) and output(Out) are in NCL format, where N is batch size, C is the number of channels, L is the length of the feature. - The output tensor shape will be [N, C, output_size]. - For max adaptive pool1d: + The output value of the layer with input size (N, C, L), + output (N, C, L_{out}) and kernel_size k can be precisely described as + For average pool1d: .. math:: - lstart &= floor(i * L_{in} / L_{out}) - - lend &= ceil((i + 1) * L_{in} / L_{out}) - - Output(i) &= max(Input[lstart:lend])} + Output(N_i, C_i, l) &= max(Input[N_i, C_i, stride \times l:stride \times l+k])} Args: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. - return_indices (bool): If true, the index of max pooling point will be returned along - with outputs. It cannot be set in average pooling type. Default False. + kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain an integer. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain an integer. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An integer, which means the feature map is zero padded by size of `padding` on every sides. + 3. 
A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides. + 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + return_indices (bool): Whether return the max indices along with the outputs. default is `False`. + ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default. + If it is set to False, the floor function will be used. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -462,53 +350,60 @@ class AdaptiveMaxPool1d(layers.Layer): None. Raises: - ValueError: 'pool_size' should be a integer or list or tuple with length as 1. + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is "VALID", but `ceil_mode` is True. + ValueError: If `padding` is a list or tuple but its length greater than 1. + ShapeError: If the input is not a 3-D. + ShapeError: If the output's shape calculated is not greater than 0. + + + Shape: + - x: 3-D tensor. + - out: 3-D tensor. Examples: + .. code-block:: python - # max adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = max(input[:, :, lstart: lend]) - # - import paddle + import paddle import paddle.nn as nn paddle.disable_static() data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16) - pool_out = AdaptiveMaxPool1d(data) + MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0) + pool_out = MaxPool1d(data) # pool_out shape: [1, 3, 16] - # for return_indices = true - AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16, return_indices=True) - pool_out, indices = AdaptiveMaxPool1d(data) + MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0, return_indices=True) + pool_out, indices = MaxPool1d(data) # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] """ - def __init__(self, output_size, return_indices=False, name=None): - super(AdaptiveMaxPool1d, self).__init__() - self.output_size = output_size + def __init__(self, + kernel_size, + stride=None, + padding=0, + return_indices=False, + ceil_mode=False, + name=None): + super(MaxPool1d, self).__init__() + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.ceil_mode = ceil_mode self.return_indices = return_indices self.name = name def forward(self, input): - return F.adaptive_max_pool1d(input, self.output_size, - self.return_indices, self.name) + out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding, + self.return_indices, self.ceil_mode, self.name) + return out -class AvgPool2d(layers.Layer): +class MaxPool2d(layers.Layer): """ - This operation applies 2D average pooling over input features based on the input, + This operation applies 2D max pooling over input feature based on the input, 
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are in NCHW format, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. @@ -522,8 +417,9 @@ class AvgPool2d(layers.Layer): Output: Out shape: $(N, C, H_{out}, W_{out})$ $$ - out(N_i, C_j, h, w) = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1} - input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) + out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\ + & \text{input}(N_i, C_j, \text{stride[0]} \times h + m, + \text{stride[1]} \times w + n) $$ Args: @@ -532,31 +428,33 @@ class AvgPool2d(layers.Layer): Otherwise, the pool kernel size will be a square of an int. stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, it must contain two integers, (pool_stride_Height, pool_stride_Width). - Otherwise, the pool stride size will be a square of an int. Default: kernel_size. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be in three forms: `[pad_height, pad_width]` or - `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, - `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - Otherwise, the pool padding size will be a square of an int. + Otherwise, the pool stride size will be a square of an int. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is `true`. - divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + return_indices (bool): Whether to return the max indices along with the outputs. data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. - Returns: None. 
+ Returns: None Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. ShapeError: If the output's shape calculated is not greater than 0. + + Shape: + - x: 4-D tensor. + - out: 4-D tensor. + Examples: .. code-block:: python import paddle @@ -566,172 +464,72 @@ class AvgPool2d(layers.Layer): # max pool2d input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - AvgPool2d = nn.AvgPool2d(kernel_size=2, - stride=2, padding=0) - output = AvgPoo2d(input) + MaxPool2d = nn.MaxPool2d(kernel_size=2, + stride=2, padding=0) + output = MaxPool2d(input) # output.shape [1, 3, 16, 16] + # for return_indices=True + MaxPool2d = nn.MaxPool2d(kernel_size=2,stride=2, padding=0, return_indices=True) + output, max_indices = MaxPool2d(input) + # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], """ def __init__(self, kernel_size, stride=None, padding=0, + return_indices=False, ceil_mode=False, - count_include_pad=True, - divisor_override=None, data_format="NCHW", name=None): - super(AvgPool2d, self).__init__() + super(MaxPool2d, self).__init__() self.ksize = kernel_size self.stride = stride self.padding = padding + self.return_indices = return_indices self.ceil_mode = ceil_mode - self.count_include_pad = count_include_pad - self.divisor = divisor_override self.data_format = data_format self.name = name def forward(self, x): - return F.avg_pool2d( + return F.max_pool2d( x, kernel_size=self.ksize, stride=self.stride, padding=self.padding, - ceil_mode=self.ceil_mode, - count_include_pad=self.count_include_pad, - divisor_override=self.divisor, + return_indices=self.return_indices, data_format=self.data_format, name=self.name) -class MaxPool2d(layers.Layer): +class MaxPool3d(layers.Layer): """ - This operation applies 2D max pooling over input feature based on the input, + This operation applies 3D max pooling over input features based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCHW format, where N is batch size, C is the number of channels, - H is the height of the feature, and W is the width of the feature. - - Example: - Input: - X shape: $(N, C, H_{in}, W_{in})$ - Attr: - kernel_size: ksize - - Output: - Out shape: $(N, C, H_{out}, W_{out})$ - $$ - out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\ - & \text{input}(N_i, C_j, \text{stride[0]} \times h + m, - \text{stride[1]} \times w + n) - $$ - - Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain two integers, (pool_size_Height, pool_size_Width). - Otherwise, the pool kernel size will be a square of an int. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, - it must contain two integers, (pool_stride_Height, pool_stride_Width). - Otherwise, the pool stride size will be a square of an int. Default: kernel_size. - padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, - it could be in three forms: `[pad_height, pad_width]` or - `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, - `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. 
- when `data_format` is `"NHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - Otherwise, the pool padding size will be a square of an int. - ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - return_indices (bool): Whether to return the max indices along with the outputs. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: None - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. - Examples: - .. code-block:: python - import paddle - import paddle.nn as nn - import numpy as np - paddle.disable_static() - - # max pool2d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - MaxPool2d = nn.MaxPool2d(kernel_size=2, - stride=2, padding=0) - output = MaxPool2d(input) - # output.shape [1, 3, 16, 16] - - # for return_indices=True - MaxPool2d = nn.MaxPool2d(kernel_size=2,stride=2, padding=0, return_indices=True) - output, max_indices = MaxPool2d(input) - # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], - """ - - def __init__(self, - kernel_size, - stride=None, - padding=0, - return_indices=False, - ceil_mode=False, - data_format="NCHW", - name=None): - super(MaxPool2d, self).__init__() - self.ksize = kernel_size - self.stride = stride - self.padding = padding - self.return_indices = return_indices - self.ceil_mode = ceil_mode - self.data_format = data_format - self.name = name - - def forward(self, x): - return F.max_pool2d( - x, - kernel_size=self.ksize, - stride=self.stride, - padding=self.padding, - return_indices=self.return_indices, - data_format=self.data_format, - name=self.name) - - -class MaxPool3d(layers.Layer): - """ - This operation applies 3D max pooling over input features based on the input, - and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCDHW format, where N is batch size, C is the number of channels, - H is the height of the feature, D is the depth of the feature, and W is the width of the feature. + in NCDHW format, where N is batch size, C is the number of channels, + H is the height of the feature, D is the depth of the feature, and W is the width of the feature. Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size + kernel_size (int|list|tuple): The pool kernel size. If the kernel size is a tuple or list, it must contain three integers, - (pool_size_Depth, pool_size_Height, pool_size_Width). + (kernel_size_Depth, kernel_size_Height, kernel_size_Width). Otherwise, the pool kernel size will be the cube of an int. - stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool stride size is a tuple or list, - it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`. - Otherwise, the pool stride size will be a cube of an int. Default kernel_size. - padding (int|list|tuple): The pool padding size. 
If pool padding size is a tuple or list, - it could be in three forms: `[pad_depth, pad_height, pad_width]` or - `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form - `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - ceil_mode (bool): when True, will use ceil instead of floor to compute the output shape. - count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is True. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. + stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain three integers, [stride_Depth, stride_Height, stride_Width). + Otherwise, the pool stride size will be a cube of an int. + padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + 1. A string in ['valid', 'same']. + 2. An int, which means the feature map is zero padded by size of `padding` on every sides. + 3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_weight] whose value means the padding size of each dimension. + 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + The default value is 0. + ceil_mode (bool): ${ceil_mode_comment} + return_indices (bool): Whether to return the max indices along with the outputs. + data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -742,6 +540,11 @@ class MaxPool3d(layers.Layer): ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. ShapeError: If the output's shape calculated is not greater than 0. + + Shape: + - x: 5-D tensor. + - out: 5-D tensor. + Examples: .. code-block:: python import paddle @@ -790,88 +593,457 @@ class MaxPool3d(layers.Layer): name=self.name) -class AvgPool3d(layers.Layer): +class AdaptiveAvgPool1d(layers.Layer): """ - This operation applies 3D max pooling over input features based on the input, - and kernel_size, stride, padding parameters. Input(X) and Output(Out) are - in NCDHW format, where N is batch size, C is the number of channels, - H is the height of the feature, D is the depth of the feature, and W is the width of the feature. 
+ + This operation applies a 1D adaptive average pooling over an input signal composed + of several input planes, based on the input, output_size, return_indices parameters. + Input(X) and output(Out) are in NCL format, where N is batch + size, C is the number of channels, L is the length of the feature. + The output tensor shape will be [N, C, output_size]. + + For average adaptive pool1d: + + .. math:: + + lstart &= floor(i * L_{in} / L_{out}) + + lend &= ceil((i + 1) * L_{in} / L_{out}) + + Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)} Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size - is a tuple or list, it must contain three integers, - (pool_size_Depth, pool_size_Height, pool_size_Width). - Otherwise, the pool kernel size will be the cube of an int. - stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If pool stride size is a tuple or list, - it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`. - Otherwise, the pool stride size will be a cube of an int. - padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list, - it could be in three forms: `[pad_depth, pad_height, pad_width]` or - `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form - `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - ceil_mode (bool): ${ceil_mode_comment} - count_include_pad (bool): Whether to exclude padding points in average pooling - mode, default is True. - divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one int. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: None. + Returns: + None. + Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. + ValueError: 'pool_size' should be a integer or list or tuple with length as 1. + + Shape: + - x: 3-D tensor. + - out: 3-D tensor. + Examples: .. code-block:: python + + # average adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. 
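+ #
+ # (illustrative: with L = 32 and output_size m = 16, as in the code below,
+ #  grid i covers input[:, :, 2*i : 2*i + 2], since
+ #  lstart = floor(i * 32 / 16) = 2*i and lend = ceil((i + 1) * 32 / 16) = 2*i + 2)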
+ # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) + # import paddle import paddle.nn as nn - import numpy as np paddle.disable_static() - # avg pool3d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) - AvgPool3d = nn.AvgPool3d(kernel_size=2, - stride=2, padding=0) - output = AvgPool3d(input) - # output.shape [1, 2, 3, 16, 16] - + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + AdaptiveAvgPool1d = nn.AdaptiveAvgPool1d(output_size=16) + pool_out = AdaptiveAvgPool1d(data) + # pool_out shape: [1, 3, 16] """ - def __init__(self, - kernel_size, - stride, - padding=0, - ceil_mode=False, - count_include_pad=True, - divisor_override=None, - data_format="NCDHW", - name=None): - super(AvgPool3d, self).__init__() - self.ksize = kernel_size - self.stride = stride - self.padding = padding - self.ceil_mode = ceil_mode - self.count_include_pad = count_include_pad - self.divisor = divisor_override - self.data_format = data_format + def __init__(self, output_size, name=None): + super(AdaptiveAvgPool1d, self).__init__() + self.output_size = output_size self.name = name - def forward(self, x): - return F.avg_pool3d( - x, - kernel_size=self.ksize, - stride=self.stride, - padding=self.padding, - ceil_mode=self.ceil_mode, - count_include_pad=self.count_include_pad, - divisor_override=self.divisor, - data_format=self.data_format, - name=self.name) + def forward(self, input): + return F.adaptive_avg_pool1d(input, self.output_size, self.name) + + +class AdaptiveAvgPool2d(layers.Layer): + """ + + This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions + of the output tensor are determined by the parameter output_size. + + For avg adaptive pool2d: + + .. math:: + + hstart &= floor(i * H_{in} / H_{out}) + + hend &= ceil((i + 1) * H_{in} / H_{out}) + + wstart &= floor(j * W_{in} / W_{out}) + + wend &= ceil((j + 1) * W_{in} / W_{out}) + + Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} + + + Parameters: + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two element, (H, W). H and W can be either a int, or None which means + the size will be the same as that of the input. + data_format (str): The data format of the input and output data. An optional string + from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in + the order of: [batch_size, input_channels, input_height, input_width]. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Shape: + x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type can be float32, float64. + output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type is same as input x. + + Returns: + A callable object of AdaptiveAvgPool2d. + + Examples: + .. code-block:: python + + # adaptive avg pool2d + # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], + # output shape is [N, C, m, n], adaptive pool divide H and W dimensions + # of input data into m * n grids averagely and performs poolings in each + # grid to get output. 
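+ #
+ # (illustrative: the grids may be uneven and overlap. With H = W = 32 and
+ #  output_size = 3, as in the code below, the windows along each spatial axis
+ #  are [0:11], [10:22] and [21:32], from hstart = floor(i * 32 / 3) and
+ #  hend = ceil((i + 1) * 32 / 3))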
+ # adaptive avg pool performs calculations as follow: + # + # for i in range(m): + # for j in range(n): + # hstart = floor(i * H / m) + # hend = ceil((i + 1) * H / m) + # wstart = floor(i * W / n) + # wend = ceil((i + 1) * W / n) + # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) + # + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(2, 3, 32, 32) + x = paddle.to_tensor(input_data) + # x.shape is [2, 3, 32, 32] + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=3) + pool_out = adaptive_avg_pool(x = x) + # pool_out.shape is [2, 3, 3, 3] + """ + + def __init__(self, output_size, data_format="NCHW", name=None): + super(AdaptiveAvgPool2d, self).__init__() + self._output_size = output_size + self._data_format = data_format + self._name = name + + def forward(self, x): + return F.adaptive_avg_pool2d( + x, + output_size=self._output_size, + data_format=self._data_format, + name=self._name) + + +class AdaptiveAvgPool3d(layers.Layer): + """ + + This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions + of the output tensor are determined by the parameter output_size. + + For avg adaptive pool3d: + + .. math:: + + dstart &= floor(i * D_{in} / D_{out}) + + dend &= ceil((i + 1) * D_{in} / D_{out}) + + hstart &= floor(j * H_{in} / H_{out}) + + hend &= ceil((j + 1) * H_{in} / H_{out}) + + wstart &= floor(k * W_{in} / W_{out}) + + wend &= ceil((k + 1) * W_{in} / W_{out}) + + Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + + + Parameters: + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means + the size will be the same as that of the input. + data_format (str): The data format of the input and output data. An optional string + from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in + the order of: [batch_size, input_channels, input_depth, input_height, input_width]. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Shape: + x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type can be float32, float64. + output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x. + + Returns: + A callable object of AdaptiveAvgPool3d. + + Examples: + .. code-block:: python + + # adaptive avg pool3d + # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions + # of input data into l * m * n grids averagely and performs poolings in each + # grid to get output. 
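+ #
+ # (illustrative: any entry of `output_size` may be None to keep that
+ #  dimension unchanged, e.g. output_size=[None, 3, 3] on the [2, 3, 8, 32, 32]
+ #  input below would give a [2, 3, 8, 3, 3] output)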
+ # adaptive avg pool performs calculations as follow: + # + # for i in range(l): + # for j in range(m): + # for k in range(n): + # dstart = floor(i * D / l) + # dend = ceil((i + 1) * D / l) + # hstart = floor(j * H / m) + # hend = ceil((j + 1) * H / m) + # wstart = floor(k * W / n) + # wend = ceil((k + 1) * W / n) + # output[:, :, i, j, k] = + # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(2, 3, 8, 32, 32) + x = paddle.to_tensor(input_data) + # x.shape is [2, 3, 8, 32, 32] + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=3) + pool_out = adaptive_avg_pool(x = x) + # pool_out = [2, 3, 3, 3, 3] + """ + + def __init__(self, output_size, data_format="NCDHW", name=None): + super(AdaptiveAvgPool3d, self).__init__() + self._output_size = output_size + self._data_format = data_format + self._name = name + + def forward(self, x): + return F.adaptive_avg_pool3d( + x, + output_size=self._output_size, + data_format=self._data_format, + name=self._name) + + +class AdaptiveMaxPool1d(layers.Layer): + """ + + This operation applies a 1D adaptive max pooling over an input signal composed + of several input planes, based on the input, output_size, return_indices parameters. + Input(X) and output(Out) are in NCL format, where N is batch + size, C is the number of channels, L is the length of the feature. + The output tensor shape will be [N, C, output_size]. + + For max adaptive pool1d: + + .. math:: + + lstart &= floor(i * L_{in} / L_{out}) + + lend &= ceil((i + 1) * L_{in} / L_{out}) + + Output(i) &= max(Input[lstart:lend])} + + Args: + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one int. + return_indices (bool): If true, the index of max pooling point will be returned along + with outputs. It cannot be set in average pooling type. Default False. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Returns: + None. + + Raises: + ValueError: 'pool_size' should be a integer or list or tuple with length as 1. + + Shape: + x (Tensor): The input tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type can be float32, float64. + output (Tensor): The output tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type is same as input x. + + Examples: + .. code-block:: python + + # max adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. 
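+ #
+ # (illustrative: the windows are built exactly as for adaptive average
+ #  pooling; only the reduction differs - the maximum of
+ #  input[:, :, lstart:lend] is taken, and with return_indices=True the
+ #  position of that maximum is returned as well)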
+ # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = max(input[:, :, lstart: lend]) + # + import paddle + import paddle.nn as nn + paddle.disable_static() + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16) + pool_out = AdaptiveMaxPool1d(data) + # pool_out shape: [1, 3, 16] + + # for return_indices = true + AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16, return_indices=True) + pool_out, indices = AdaptiveMaxPool1d(data) + # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + + """ + + def __init__(self, output_size, return_indices=False, name=None): + super(AdaptiveMaxPool1d, self).__init__() + self.output_size = output_size + self.return_indices = return_indices + self.name = name + + def forward(self, input): + return F.adaptive_max_pool1d(input, self.output_size, + self.return_indices, self.name) + + +class AdaptiveMaxPool2d(layers.Layer): + """ + This operation applies 2D adaptive max pooling on input tensor. The h and w dimensions + of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. + For adaptive max pool2d: + .. math:: + hstart &= floor(i * H_{in} / H_{out}) + hend &= ceil((i + 1) * H_{in} / H_{out}) + wstart &= floor(j * W_{in} / W_{out}) + wend &= ceil((j + 1) * W_{in} / W_{out}) + Output(i ,j) &= max(Input[hstart:hend, wstart:wend]) + Parameters: + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two element, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. + return_indices (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Shape: + x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float32, float64. + output (Tensor): The output tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type is same as input x. + + Returns: + A callable object of AdaptiveMaxPool2d. + Examples: + .. code-block:: python + # adaptive max pool2d + # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], + # output shape is [N, C, m, n], adaptive pool divide H and W dimensions + # of input data into m * n grids averagely and performs poolings in each + # grid to get output. 
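+ #
+ # (illustrative: when the input size is a multiple of output_size this
+ #  reduces to a plain max pool, e.g. 32 -> 16 behaves like
+ #  MaxPool2d(kernel_size=2, stride=2); for 32 -> 3, as in the code below,
+ #  the windows overlap and follow the floor/ceil formulas above)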
+ # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # for j in range(n): + # hstart = floor(i * H / m) + # hend = ceil((i + 1) * H / m) + # wstart = floor(i * W / n) + # wend = ceil((i + 1) * W / n) + # output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend]) + # + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(2, 3, 32, 32) + x = paddle.to_tensor(input_data) + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=3, return_indices=True) + pool_out, indices = adaptive_max_pool(x = x) + """ + + def __init__(self, output_size, return_indices=False, name=None): + super(AdaptiveMaxPool2d, self).__init__() + self._output_size = output_size + self._return_indices = return_indices + self._name = name + + def forward(self, x): + return F.adaptive_max_pool2d( + x, + output_size=self._output_size, + return_indices=self._return_indices, + name=self._name) + + +class AdaptiveMaxPool3d(layers.Layer): + """ + This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions + of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. + For adaptive max pool3d: + .. math:: + dstart &= floor(i * D_{in} / D_{out}) + dend &= ceil((i + 1) * D_{in} / D_{out}) + hstart &= floor(j * H_{in} / H_{out}) + hend &= ceil((j + 1) * H_{in} / H_{out}) + wstart &= floor(k * W_{in} / W_{out}) + wend &= ceil((k + 1) * W_{in} / W_{out}) + Output(i ,j, k) &= max(Input[dstart:dend, hstart:hend, wstart:wend]) + Parameters: + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means + the size will be the same as that of the input. + return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Shape: + x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64. + output (Tensor): The output tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type is same as input x. + Returns: + A callable object of AdaptiveMaxPool3d. + Examples: + .. code-block:: python + # adaptive max pool3d + # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions + # of input data into l * m * n grids averagely and performs poolings in each + # grid to get output. 
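+ #
+ # (illustrative: with the [2, 3, 8, 32, 32] input and output_size=4 below,
+ #  depth windows are 2 wide (dstart = 2*i, dend = 2*i + 2) while height and
+ #  width windows are 8 wide (hstart = 8*j, hend = 8*j + 8))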
+ # adaptive max pool performs calculations as follow: + # + # for i in range(l): + # for j in range(m): + # for k in range(n): + # dstart = floor(i * D / l) + # dend = ceil((i + 1) * D / l) + # hstart = floor(j * H / m) + # hend = ceil((j + 1) * H / m) + # wstart = floor(k * W / n) + # wend = ceil((k + 1) * W / n) + # output[:, :, i, j, k] = + # max(input[:, :, dstart:dend, hstart: hend, wstart: wend]) + import paddle + import numpy as np + paddle.disable_static() + input_data = np.random.rand(2, 3, 8, 32, 32) + x = paddle.to_tensor(input_data) + pool = paddle.nn.AdaptiveMaxPool3d(output_size=4) + out = pool(x) + # out shape: [2, 3, 4, 4, 4] + pool, indices = paddle.nn.AdaptiveMaxPool3d(output_size=3, return_indices=True) + out = pool(x) + # out shape: [2, 3, 4, 4, 4], indices shape: [2, 3, 4, 4, 4] + + """ + + def __init__(self, output_size, return_indices=False, name=None): + super(AdaptiveMaxPool3d, self).__init__() + self._output_size = output_size + self._return_indices = return_indices + self._name = name + + def forward(self, x): + return F.adaptive_max_pool3d( + x, + output_size=self._output_size, + return_indices=self._return_indices, + name=self._name) diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 49314c9832dd389411dffb3f498b34d09337a3f0..095a34cb6fc68cda6900790141d226208b203f82 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -26,9 +26,8 @@ __all__ = [ ] -from ..fluid.optimizer import SGD, Momentum, Adagrad, Dpsgd, DecayedAdagrad, \ - Ftrl, Adadelta, \ - SGDOptimizer, MomentumOptimizer, AdagradOptimizer,DpsgdOptimizer,\ +from ..fluid.optimizer import Momentum, Adagrad, Dpsgd, DecayedAdagrad, Ftrl,\ + AdagradOptimizer,DpsgdOptimizer,\ DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \ ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\ ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \ @@ -39,6 +38,9 @@ from .adam import Adam from .adamw import AdamW from .adamax import Adamax from .rmsprop import RMSProp +from .adadelta import Adadelta +from .sgd import SGD +from .momentum import Momentum from . import lr_scheduler from .lr_scheduler import _LRScheduler, NoamLR, PiecewiseLR, NaturalExpLR, InverseTimeLR, PolynomialLR, \ diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py new file mode 100644 index 0000000000000000000000000000000000000000..bba2c11ea07490804573189bac8b315dfc80fd37 --- /dev/null +++ b/python/paddle/optimizer/adadelta.py @@ -0,0 +1,144 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import Optimizer +from ..fluid import core +from ..fluid import framework +from ..fluid.framework import Variable, name_scope + +__all__ = ["Adadelta"] + + +class Adadelta(Optimizer): + """ + **Notes: This API does not support sparse parameter optimization.** + + Adadelta Optimizer. 
Please refer to this for details:
+ `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD `_.
+
+ The update is done as follows:
+
+ .. math::
+
+ E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2
+
+ learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) }
+
+ E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2
+
+ Args:
+ learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+ It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
+ epsilon (float): a small float number for numeric stability. Default 1.0e-6.
+ rho (float): a floating point value indicating the decay rate. Default 0.95.
+ parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+ This parameter is required in dygraph mode. \
+ The default value is None in static mode, in which case all parameters will be updated.
+ weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+ It can be a float value as the coefficient of L2 regularization or \
+ :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+ If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+ the regularization setting here in optimizer will be ignored for this parameter. \
+ Otherwise, the regularization setting here in optimizer will take effect. \
+ Default None, meaning there is no regularization.
+ grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+ some derived class of ``GradientClipBase`` . There are three clipping strategies
+ ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+ :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+ name (str, optional): The default value is None. Normally there is no need for users
+ to set this property. For more information, please refer to
+ :ref:`api_guide_Name` .
+
+ Examples:
+ ..
code-block:: python + import paddle + import numpy as np + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + adadelta = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01) + back = out.backward() + adadelta.step() + adadelta.clear_grad() + + """ + + _avg_squared_grad_acc_str = "_avg_squared_grad" + _avg_squared_update_acc_str = "_avg_squared_update" + + def __init__(self, + learning_rate=0.001, + epsilon=1.0e-6, + rho=0.95, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None): + if learning_rate is None: + raise ValueError("learning_rate is not set.") + if epsilon is None: + raise ValueError("epsilon is not set.") + if rho is None: + raise ValueError("rho is not set.") + super(Adadelta, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name) + self.type = "adadelta" + self._epsilon = epsilon + self._rho = rho + + def _create_accumulators(self, block, parameters): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + for p in parameters: + self._add_accumulator(self._avg_squared_grad_acc_str, p) + self._add_accumulator(self._avg_squared_update_acc_str, p) + + def _append_optimize_op(self, block, param_and_grad): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + avg_squared_grad_acc = self._get_accumulator( + self._avg_squared_grad_acc_str, param_and_grad[0]) + avg_squared_update_acc = self._get_accumulator( + self._avg_squared_update_acc_str, param_and_grad[0]) + + # Create the adadelta optimizer op + adadelta_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "AvgSquaredGrad": avg_squared_grad_acc, + "AvgSquaredUpdate": avg_squared_update_acc + }, + outputs={ + "ParamOut": param_and_grad[0], + "AvgSquaredGradOut": avg_squared_grad_acc, + "AvgSquaredUpdateOut": avg_squared_update_acc + }, + attrs={"epsilon": self._epsilon, + "rho": self._rho}, + stop_gradient=True) + + return adadelta_op diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py new file mode 100644 index 0000000000000000000000000000000000000000..87fa86c17615ef8cc455e95517608a246d677e74 --- /dev/null +++ b/python/paddle/optimizer/momentum.py @@ -0,0 +1,149 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+ from .optimizer import Optimizer
+ from ..fluid import core
+ from ..fluid import framework
+ from ..fluid.framework import Variable, name_scope
+
+ __all__ = ["Momentum"]
+
+
+ class Momentum(Optimizer):
+ """
+
+ Simple Momentum optimizer with velocity state
+
+ This optimizer has a flag for Nesterov Momentum.
+
+ The update equations are as follows:
+
+ .. math::
+
+ & velocity = mu * velocity + gradient
+
+ & if (use\_nesterov):
+
+ &\quad param = param - (gradient + mu * velocity) * learning\_rate
+
+ & else:
+
+ &\quad param = param - learning\_rate * velocity
+
+ Parameters:
+
+ learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+ It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
+ momentum (float): Momentum factor. The default value is 0.9.
+ parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+ This parameter is required in dygraph mode. \
+ The default value is None in static mode, in which case all parameters will be updated.
+ weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+ It can be a float value as the coefficient of L2 regularization or \
+ :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+ If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+ the regularization setting here in optimizer will be ignored for this parameter. \
+ Otherwise, the regularization setting here in optimizer will take effect. \
+ Default None, meaning there is no regularization.
+ grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+ some derived class of ``GradientClipBase`` . There are three clipping strategies
+ ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+ :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+ name (str, optional): The default value is None. Normally there is no need for users
+ to set this property. For more information, please refer to
+ :ref:`api_guide_Name` .
+
+ Examples:
+ ..
code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + momentum = paddle.optimizer.Momentum(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01) + back = out.backward() + momentum.step() + momentum.clear_grad() + """ + _velocity_acc_str = "velocity" + + def __init__(self, + learning_rate=0.001, + momentum=0.9, + parameters=None, + use_nesterov=False, + weight_decay=None, + grad_clip=None, + name=None): + if learning_rate is None: + raise ValueError("learning_rate is not set") + if momentum is None: + raise ValueError("momentum is not set") + super(Momentum, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name) + self.type = "momentum" + self._momentum = momentum + self._use_nesterov = bool(use_nesterov) + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + for p in parameters: + self._add_accumulator(self._velocity_acc_str, p) + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + velocity_acc = self._get_accumulator(self._velocity_acc_str, + param_and_grad[0]) + lr = self._create_param_lr(param_and_grad) + + if framework.in_dygraph_mode(): + _, _ = core.ops.momentum(param_and_grad[0], param_and_grad[1], + velocity_acc, lr, param_and_grad[0], + velocity_acc, 'mu', self._momentum, + 'use_nesterov', self._use_nesterov) + return None + + attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov} + inputs = { + "Param": [param_and_grad[0]], + "Grad": [param_and_grad[1]], + "Velocity": [velocity_acc], + "LearningRate": [lr] + } + + outputs = { + "ParamOut": [param_and_grad[0]], + "VelocityOut": [velocity_acc] + } + # create the momentum optimize op + momentum_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) + + return momentum_op diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py new file mode 100644 index 0000000000000000000000000000000000000000..bb3a578e15724e9501d69dc209bdedc65afeb82b --- /dev/null +++ b/python/paddle/optimizer/sgd.py @@ -0,0 +1,108 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import Optimizer +from ..fluid import core +from ..fluid import framework +from ..fluid.framework import Variable, name_scope +from ..fluid.dygraph import no_grad +__all__ = ["SGD"] + + +class SGD(Optimizer): + """ + Optimizer of the stochastic gradient descent algorithm. + + .. 
math:: + + param\_out = param - learning\_rate * grad + + Parameters: + learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``. + It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001. + parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): The default value is None. Normally there is no need for user + to set this property. For more information, please refer to + :ref:`api_guide_Name` . + + Examples: + .. code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01) + back = out.backward() + sgd.step() + sgd.clear_grad() + + """ + + def __init__(self, + learning_rate=0.001, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None): + if learning_rate is None: + raise ValueError("learning_rate is not set") + super(SGD, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name) + self.type = "sgd" + + @no_grad() + def _append_optimize_op(self, block, param_and_grad): + lr = self._create_param_lr(param_and_grad) + if framework.in_dygraph_mode(): + core.ops.sgd(param_and_grad[0], lr, param_and_grad[1], + param_and_grad[0]) + return None + + assert isinstance(block, framework.Block) + # create the optimize op + sgd_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": lr + }, + outputs={"ParamOut": param_and_grad[0]}, + stop_gradient=True) + + return sgd_op diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index cb3caf0656e8fd4aba905feed92f10238d1fc9d0..9ef66712540aa54eac39b7e6160c5c91b6e3fcd5 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -71,22 +71,22 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): Args: data(scalar|tuple|list|ndarray|Tensor|ComplexTensor): Initial data for the tensor. Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor, paddle\.ComplexTensor. 
- dtype(str, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , + dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8'. And - 'complex64' , 'complex128' only for ComplexTensor. - Default: None, infers data type from ``data`` . + 'complex64' , 'complex128' only for ComplexTensor. Default: None, for float point number, + get type from ``get_default_type``, for other type, infers from ``data`` . place(CPUPlace|CUDAPinnedPlace|CUDAPlace, optional): The place to allocate Tensor. Can be CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True. Returns: - Tensor: A Tensor or ComplexTensor constructed from ``data``. + Tensor: A Tensor or ComplexTensor constructed from ``data`` . Raises: TypeError: If the data type of ``data`` is not scalar, list, tuple, numpy.ndarray, paddle.Tensor, paddle.ComplexTensor ValueError: If ``data`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]] TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128 - ValueError: If ``place`` is not paddle.Place, paddle.CUDAPinnedPlace, paddle.CUDAPlace + ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace Examples: @@ -94,7 +94,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): import paddle import numpy as np - paddle.enable_imperative() + paddle.disable_static() type(paddle.to_tensor(1)) # @@ -132,7 +132,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): # - dtype: double # - data: [0.1 0.2 0.3 0.4] - type(paddle.to_tensor([[1+1j, 2], [3+2j, 4]]), , dtype='complex64') + type(paddle.to_tensor([[1+1j, 2], [3+2j, 4]]), dtype='complex64') # paddle.to_tensor([[1+1j, 2], [3+2j, 4]], dtype='complex64') @@ -189,12 +189,13 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor|paddle.ComplexTensor". 
format(type(data))) - if dtype: - dtype = convert_dtype(dtype) - if dtype != data.dtype: - data = data.astype(dtype) - if not np.iscomplexobj(data): + if dtype: + dtype = convert_dtype(dtype) + elif data.dtype in ['float16', 'float32', 'float64']: + dtype = paddle.framework.get_default_dtype() + if dtype and dtype != data.dtype: + data = data.astype(dtype) return paddle.Tensor( value=data, place=place, @@ -202,6 +203,14 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): zero_copy=True, stop_gradient=stop_gradient) else: + if dtype: + dtype = convert_dtype(dtype) + else: + dtype = paddle.framework.get_default_dtype() + dtype = 'complex64' if dtype in ['float16', 'float32' + ] else 'complex128' + if dtype != data.dtype: + data = data.astype(dtype) name = unique_name.generate('generated_tensor') real_tensor = paddle.Tensor( value=data.real, diff --git a/tools/summary_env.py b/tools/summary_env.py index 0252d9adcd07255e69a1abd81c7704eda02745b8..39d6acaf536c533a218d3d53b596c469ab19922d 100644 --- a/tools/summary_env.py +++ b/tools/summary_env.py @@ -55,7 +55,7 @@ def get_os_info(): else: plat = None ver = None - envs['os_info'] = "{} {}".format(plat, ver) + envs['os_info'] = "{0} {1}".format(plat, ver) def get_python_info(): @@ -93,7 +93,7 @@ def get_cudnn_info(): if cudnn_dll_path: cudnn_header_path = cudnn_dll_path.split('bin')[ 0] + 'include\cudnn.h' - cmd = 'type "{}" | findstr "{}" | findstr /v "CUDNN_VERSION"' + cmd = 'type "{0}" | findstr "{1}" | findstr /v "CUDNN_VERSION"' else: envs['cudnn_version'] = None return @@ -102,7 +102,7 @@ def get_cudnn_info(): 'whereis "cudnn.h" | awk \'{print $2}\'') if cudnn_header_path: cudnn_header_path = cudnn_header_path.strip() - cmd = 'cat "{}" | grep "{}" | grep -v "CUDNN_VERSION"' + cmd = 'cat "{0}" | grep "{1}" | grep -v "CUDNN_VERSION"' else: envs['cudnn_version'] = None return @@ -112,7 +112,7 @@ def get_cudnn_info(): patch_level = _get_cudnn_ver( cmd.format(cudnn_header_path, 'CUDNN_PATCHLEVEL')) - envs['cudnn_version'] = "{}.{}.{}".format(major, minor, patch_level) + envs['cudnn_version'] = "{0}.{1}.{2}".format(major, minor, patch_level) def get_driver_info(): @@ -132,7 +132,7 @@ def main(): get_cuda_info() get_cudnn_info() get_driver_info() - print(envs_template.format(**envs)) + print('*' * 40 + envs_template.format(**envs) + '*' * 40) if __name__ == '__main__': diff --git a/tools/wlist.json b/tools/wlist.json index ce6f5fb176b5baa66a480566d8aa884620c5332c..20f6a9cbaedb391995b3757612ec24f2061a8a81 100644 --- a/tools/wlist.json +++ b/tools/wlist.json @@ -247,6 +247,7 @@ "prroi_pool" ], "wlist_temp":[ + "to_tensor", "ChunkEvaluator", "EditDistance", "ErrorClipByValue",
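For reference, a minimal sketch of the default-dtype behaviour that the python/paddle/tensor/creation.py hunk above gives paddle.to_tensor. This is illustrative only; paddle.set_default_dtype is assumed to be the public counterpart of the paddle.framework.get_default_dtype call used in the patch, and the dtypes in the comments are the expected results, not verified output.

```python
import numpy as np
import paddle

paddle.disable_static()

# Floating-point data now follows the global default dtype rather than the
# numpy dtype of `data` (per the creation.py change above).
paddle.set_default_dtype('float64')          # assumed public setter
x = paddle.to_tensor([0.1, 0.2, 0.3])        # expected dtype: float64

# Complex data chooses complex64/complex128 from the same default:
# float16/float32 -> complex64, otherwise complex128.
y = paddle.to_tensor(np.array([1 + 1j, 2]))  # expected dtype: complex128

# Integer data is unaffected and still infers its dtype from `data`.
z = paddle.to_tensor([1, 2, 3])              # expected dtype: int64
```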