diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index b541d73bc6a633d8e6a77ff567d756f3b40bfce9..8a655b2954dea5d6b864616ed2f4d19b167c4be8 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 42ab4d559f6659edfc35040fb30fdcec3dc3f8aa)
+    set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa)
   endif()
 
   if(NOT CUDA_ARCH_NAME)
diff --git a/go/paddle/config.go b/go/paddle/config.go
index cea69e716bffada9e5565eacf8ac1af84ae5b930..c4f39fa9c5d627a689c064bbbd2178cd1ae1a929 100644
--- a/go/paddle/config.go
+++ b/go/paddle/config.go
@@ -154,10 +154,17 @@ func (config *AnalysisConfig) EnableMkldnnQuantizer() {
 	C.PD_EnableMkldnnQuantizer(config.c)
 }
 
+func (config *AnalysisConfig) EnableMkldnnBfloat16() {
+	C.PD_EnableMkldnnBfloat16(config.c)
+}
+
 func (config *AnalysisConfig) MkldnnQuantizerEnabled() bool {
 	return ConvertCBooleanToGo(C.PD_MkldnnQuantizerEnabled(config.c))
 }
 
+func (config *AnalysisConfig) MkldnnBfloat16Enabled() bool {
+	return ConvertCBooleanToGo(C.PD_MkldnnBfloat16Enabled(config.c))
+}
 // SetModelBuffer
 // ModelFromMemory
 
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 10d2c2c6c9172ef2025d72e1723d74c8423aed1d..9d5c0cc7048f7db539c090d28c6184ac6d72d75a 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -119,7 +119,7 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_
 cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
         framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
 
-cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
+cc_library(attribute SRCS attribute.cc DEPS framework_proto boost enforce)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
 device_context)
 
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index 84b5502ff7b369452e7c9988d185450934c78b03..29312370b3448bfe3c04b914ce0748eb1a66cf32 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -115,6 +115,7 @@ message VarType {
     SIZE_T = 19;
     UINT8 = 20;
     INT8 = 21;
+    BF16 = 22;
 
     // Other types that may need additional descriptions
     LOD_TENSOR = 7;
diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc
index d00e38784c2c0415a59a33fc24d708c253481c21..9bde9e20b19a0b14ce4489b91d9ab3d5273f7f9a 100644
--- a/paddle/fluid/framework/generator.cc
+++ b/paddle/fluid/framework/generator.cc
@@ -12,67 +12,122 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/generator.h"
+
+#include <glog/logging.h>
+
 #include <deque>
 #include <memory>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
 
-#include "paddle/fluid/framework/generator.h"
-
 namespace paddle {
 namespace framework {
 
-std::shared_ptr<Generator> Generator::gen_instance_ = NULL;
+const std::shared_ptr<Generator>& DefaultCPUGenerator() {
+  static auto default_cpu_generator =
+      std::make_shared<Generator>(GetRandomSeed());
+  VLOG(4) << "initial seed: " << default_cpu_generator->GetCurrentSeed()
+          << ", cpu engine: " << default_cpu_generator->GetCPUEngine().get();
+  return default_cpu_generator;
+}
+
+std::shared_ptr<std::mt19937_64> OpDefaultCPUEngine() {
+  static auto op_default_cpu_engine = std::make_shared<std::mt19937_64>();
+  return op_default_cpu_engine;
+}
+
+// NOTE(zhiqiu): there are 3 conditions:
+// (1) op seed is not set and DefaultCPUGenerator is inited: use the engine
+// of DefaultCPUGenerator
+// (2) op seed is not set and DefaultCPUGenerator is not inited: create a
+// new engine and seed it with a random seed
+// (3) op seed is set: create a new engine and seed it with the op seed
+std::shared_ptr<std::mt19937_64> GetCPURandomEngine(uint64_t seed) {
+  if (DefaultCPUGenerator()->GetIsInitPy() && seed == 0) {
+    VLOG(4) << "Use random engine from generator";
+    return DefaultCPUGenerator()->GetCPUEngine();
+  } else {
+    // NOTE(zhiqiu): creating an engine instance every time instead of using
+    // OpDefaultCPUEngine(), this is the legacy behavior of random operators.
+    // The benefit is that when running PE with fixed-seed in multiple threads,
+    // each thread has its own engine, and they don't affect each other.
+    //
+    // And we need to measure the determinacy of Generator in PE.
+    auto engine = std::make_shared<std::mt19937_64>();
+    if (seed == 0) {
+      seed = GetRandomSeed();
+      VLOG(4) << "Use default random engine with random seed = " << seed;
+    } else {
+      VLOG(4) << "Use default random engine with fixed random seed = " << seed;
+    }
+    static std::mutex mu_;
+    {
+      std::lock_guard<std::mutex> lock(mu_);
+      engine->seed(seed);
+    }
+    return engine;
+  }
+}
 
-GeneratorState* Generator::GetState() {
-  std::lock_guard<std::mutex> lock(this->mutex);
-  return this->state_.get();
+GeneratorState Generator::GetState() {
+  std::lock_guard<std::mutex> lock(this->mu_);
+  state_.cpu_engine = *engine_;
+  return this->state_;
 }
 
-void Generator::SetState(GeneratorState* state_in) {
-  std::lock_guard<std::mutex> lock(this->mutex);
-  *this->state_ = *state_in;
+void Generator::SetState(const GeneratorState& state) {
+  std::lock_guard<std::mutex> lock(this->mu_);
+  this->state_ = state;
+  this->engine_ = std::make_shared<std::mt19937_64>(state.cpu_engine);
 }
 
 uint64_t Generator::GetCurrentSeed() {
-  std::lock_guard<std::mutex> lock(this->mutex);
-  return this->state_->current_seed;
+  std::lock_guard<std::mutex> lock(this->mu_);
+  return this->state_.current_seed;
 }
 
 uint64_t Generator::Seed() {
-  std::lock_guard<std::mutex> lock(this->mutex);
+  std::lock_guard<std::mutex> lock(this->mu_);
   uint64_t seed;
   std::random_device de;
   seed = ((((uint64_t)de()) << 32) + de()) & 0x1FFFFFFFFFFFFF;
-  this->state_->current_seed = seed;
+  this->state_.current_seed = seed;
   std::seed_seq seq({seed});
-  this->state_->cpu_engine.seed(seq);
+  this->engine_->seed(seq);
 
-  return this->state_->current_seed;
+  return this->state_.current_seed;
 }
 
 void Generator::SetCurrentSeed(uint64_t seed) {
-  std::lock_guard<std::mutex> lock(this->mutex);
-  this->state_->current_seed = uint64_t(seed);
+  std::lock_guard<std::mutex> lock(this->mu_);
+  this->state_.current_seed = seed;
   std::seed_seq seq({seed});
-  this->state_->cpu_engine.seed(seq);
+  this->engine_->seed(seq);
 }
 
-std::mt19937_64& Generator::GetCPUEngine() {
-  std::lock_guard<std::mutex> lock(this->mutex);
-  return this->state_->cpu_engine;
+std::shared_ptr<std::mt19937_64> Generator::GetCPUEngine() {
+  std::lock_guard<std::mutex> lock(this->mu_);
+  return this->engine_;
 }
 
-void Generator::SetCPUEngine(std::mt19937_64 engine) {
-  std::lock_guard<std::mutex> lock(this->mutex);
-  this->state_->cpu_engine = std::mt19937_64(engine);
+void Generator::SetCPUEngine(std::shared_ptr<std::mt19937_64> engine) {
+  std::lock_guard<std::mutex> lock(this->mu_);
+  this->engine_ = engine;
 }
 
 uint64_t Generator::Random64() {
-  std::lock_guard<std::mutex> lock(this->mutex);
-  return this->state_->cpu_engine();
+  std::lock_guard<std::mutex> lock(this->mu_);
+  auto engine = this->engine_;
+  return (*engine)();
+}
+
+void Generator::SetIsInitPy(bool is_init_py) {
+  this->is_init_py_ = is_init_py;
+  VLOG(4) << "SetIsInitPy:" << this->is_init_py_;
 }
+bool Generator::GetIsInitPy() const { return this->is_init_py_; }
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h
index 17870782ba72a3247de734642962ffec48c0c91e..82b35f7ad550e770e8d10457ddf6cdf8e6fbd709 100644
--- a/paddle/fluid/framework/generator.h
+++ b/paddle/fluid/framework/generator.h
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <glog/logging.h>
 #include <stdint.h>
+
 #include <atomic>
 #include <deque>
 #include <iostream>  // temp for debug
@@ -27,6 +29,12 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+inline uint64_t GetRandomSeed() {  // inline: this definition lives in a header
+  std::random_device rd;
+  // A double has a 53-bit significand, so limit the seed to 53 bits.
+  return ((static_cast<uint64_t>(rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF;
+}
+
 struct GeneratorState {
   int64_t device = -1;
   uint64_t current_seed = 34342423252;
@@ -35,62 +43,67 @@ struct GeneratorState {
 
 struct Generator {
   Generator() {
-    GeneratorState default_gen_state_cpu;
-    default_gen_state_cpu.device = -1;
-    default_gen_state_cpu.current_seed = 34342423252;
-    std::seed_seq seq({34342423252});
-    default_gen_state_cpu.cpu_engine = std::mt19937_64(seq);
-    this->state_ = std::make_shared<GeneratorState>(default_gen_state_cpu);
+    auto seed = GetRandomSeed();
+    std::seed_seq seq({seed});
+    auto engine = std::make_shared<std::mt19937_64>(seq);
+    this->state_.cpu_engine = *engine;
+    this->state_.device = -1;
+    this->state_.current_seed = seed;
+    this->engine_ = engine;
+    VLOG(4) << "initial seed: " << this->state_.current_seed
+            << ", cpu engine: " << &this->state_.cpu_engine;
+  }
+  explicit Generator(uint64_t seed) {
+    std::seed_seq seq({seed});
+    auto engine = std::make_shared<std::mt19937_64>(seq);
+    this->state_.cpu_engine = *engine;
+    this->state_.device = -1;
+    this->state_.current_seed = seed;
+    this->engine_ = engine;
+    VLOG(4) << "initial seed: " << this->state_.current_seed
+            << ", cpu engine: " << &this->state_.cpu_engine;
+    this->is_init_py_ = true;  // TODO(zhiqiu): remove it in future
   }
-  explicit Generator(GeneratorState state_in)
-      : state_{std::make_shared<GeneratorState>(state_in)} {}
-  Generator(const Generator& other)
-      : Generator(other, std::lock_guard<std::mutex>(other.mutex)) {}
+  Generator(const Generator& other) = delete;
 
   // get random state
-  GeneratorState* GetState();
+  GeneratorState GetState();
   // set random state
-  void SetState(GeneratorState* state_in);
+  void SetState(const GeneratorState&);
   // get current seed
   uint64_t GetCurrentSeed();
   // random a seed and get
   uint64_t Seed();
-
   // set seed
   void SetCurrentSeed(uint64_t seed);
   // get cpu engine
-  std::mt19937_64& GetCPUEngine();
+  std::shared_ptr<std::mt19937_64> GetCPUEngine();
   // set cpu engine
-  void SetCPUEngine(std::mt19937_64 engine);
+  void SetCPUEngine(std::shared_ptr<std::mt19937_64>);
 
   uint64_t Random64();
 
-  bool is_init_py = false;
+  void SetIsInitPy(bool);
+  bool GetIsInitPy() const;
 
-  // CPU Generator singleton
-  static std::shared_ptr<Generator> GetInstance() {
-    if (NULL == gen_instance_) {
-      gen_instance_.reset(new paddle::framework::Generator());
-    }
-    return gen_instance_;
-  }
+ private:
+  GeneratorState state_;
+  std::shared_ptr<std::mt19937_64> engine_;
+  mutable std::mutex mu_;
+
+  // NOTE(zhiqiu): is_init_py_ is used to make generator be compatible with
+  // old seed, and it should be removed after all random-related operators
+  // and unittests upgrades to use generator.
+  bool is_init_py_ = false;
+};
 
-  static std::shared_ptr<Generator> GetInstanceX() {
-    if (NULL == gen_instance_) {
-      gen_instance_.reset(new paddle::framework::Generator());
-    }
-    gen_instance_->is_init_py = true;
-    return gen_instance_;
-  }
+// The DefaultCPUGenerator is used in manual_seed()
+const std::shared_ptr<Generator>& DefaultCPUGenerator();
 
- private:
-  static std::shared_ptr<Generator> gen_instance_;
-  std::shared_ptr<GeneratorState> state_;
-  mutable std::mutex mutex;
+// If op seed is set or global is not set, the OpDefaultCPUEngine is used.
+std::shared_ptr<std::mt19937_64> OpDefaultCPUEngine();
 
-  Generator(const Generator& other, const std::lock_guard<std::mutex>&)
-      : state_(std::make_shared<GeneratorState>(*(other.state_))) {}
-};
+std::shared_ptr<std::mt19937_64> GetCPURandomEngine(uint64_t);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
index 079fb1479861ca0840b47470339f2f7a5b6bffa8..b50b4f37caecd8d8d5c393ee3a5c5b76c1f406be 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
+#include <cmath>
 #include <functional>
 #include <string>
 #include <vector>
@@ -74,12 +75,17 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
   auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
   auto weights_shape = weights->dims();
   auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
+  auto* weights_data = weights->mutable_data<float>(platform::CPUPlace());
 
-  EigenMatrixArrayMap weights_array_2d(
-      weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
-      weights_shape_2d[1]);
+  EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0],
+                                       weights_shape_2d[1]);
 
   weights_array_2d.colwise() *= scale_array;
+
+  // Flush subnormal weights to zero: they slow down convolution execution.
+  for (int64_t i = 0; i < weights->numel(); ++i) {
+    if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0;
+  }
 }
 
 void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
@@ -108,13 +114,6 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
 
     GET_CONV_BN_NODES(conv_ac_pattern);
 
-    // check if fuse can be done and if MKL-DNN should be used
-    FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel);
-    if (fuse_option == DO_NOT_FUSE) {
-      VLOG(3) << "do not perform conv+affinechannel fuse";
-      return;
-    }
-
     // Create eltwise_y (conv bias) variable
     VarDesc eltwise_y_in_desc(
         patterns::PDNodeName(name_scope_, "eltwise_y_in"));
@@ -143,6 +142,7 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
     desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
     desc.SetType("elementwise_add");
     desc.SetAttr("axis", 1);
+    desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists<bool>("use_mkldnn"));
     auto eltwise_op = g->CreateOpNode(&desc);  // OpDesc will be copied.
 
     GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel});
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index ff6dffa704eeceeabfc5eb1d6786f40b2e523e98..3d65fe595373fa98ba237f04134c75d4a60a7242 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1879,6 +1879,19 @@ PDNode *patterns::MultipleQuantize::operator()() {
   return prev_out;
 }
 
+PDNode *patterns::QuantizePlacement::operator()(
+    const std::unordered_set<std::string> &quantize_enabled_op_types) {
+  std::unordered_set<std::string> supported_op_types =
+      std::unordered_set<std::string>({"concat", "conv2d", "elementwise_add",
+                                       "fc", "matmul", "pool2d", "prior_box",
+                                       "relu", "reshape2", "transpose2"});
+  if (!quantize_enabled_op_types.empty()) {
+    supported_op_types = quantize_enabled_op_types;
+  }
+  auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types);
+  return op;
+}
+
 PDNode *patterns::MKLDNNInPlace::operator()() {
   const std::unordered_set<std::string> &supported_op_types = {
       "abs",
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index e1cce7848dd54b02a540b144ca1088f62eeb52cb..0803265884165bc754489b18d07c0d277a4bd92b 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1120,6 +1120,15 @@ struct MultipleQuantize : public PatternBase {
   PATTERN_DECL_NODE(prev_out);
 };
 
+struct QuantizePlacement : public PatternBase {
+  QuantizePlacement(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "quantize_placement") {}
+  PDNode* operator()(
+      const std::unordered_set<std::string>& quantize_enabled_op_types);
+
+  PATTERN_DECL_NODE(op);
+};
+
 // Pattern used for enforcing inplace computation for in-place computation
 // supporting DNNL ops. softmax, batch_norm and layer_norm
 struct MKLDNNInPlace : public PatternBase {
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
index 0644cf9bb6575462d2d8362713a4720d2684bf8d..bc268a834780cad843a18a74bb7f50a639db103d 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
@@ -26,27 +26,33 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
       Get<std::unordered_set<int>>("quantize_excluded_op_ids");
   const auto& op_types_list =
       Get<std::unordered_set<std::string>>("quantize_enabled_op_types");
-  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp()) {
-      if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(),
-                    n->id()) != excluded_ids_list.end())
-        continue;
-      auto* op = n->Op();
-      if (op->HasAttr("mkldnn_data_type") ||
-          op->HasProtoAttr("mkldnn_data_type")) {
-        // use_quantizer is no longer used
-        // assign value for compatibility
-        if (op->GetAttrIfExists<bool>("use_quantizer")) {
-          op->SetAttr("mkldnn_data_type", std::string("int8"));
-        }
-        if (std::find(op_types_list.begin(), op_types_list.end(), op->Type()) !=
-            op_types_list.end()) {
-          op->SetAttr("mkldnn_data_type", std::string("int8"));
-          op->SetAttr("use_quantizer", true);
-        }
+  Init(name_scope_, graph);
+  GraphPatternDetector gpd;
+  patterns::QuantizePlacement quantize_placement_pattern{gpd.mutable_pattern(),
+                                                         "quantize_placement"};
+  quantize_placement_pattern(op_types_list);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(op, op, quantize_placement_pattern);
+
+    // Skip ops that were explicitly excluded from quantization by id.
+    if (excluded_ids_list.count(op->id()) != 0) {
+      return;
+    }
+
+    if (op->Op()->HasAttr("mkldnn_data_type") ||
+        op->Op()->HasProtoAttr("mkldnn_data_type")) {
+      // use_quantizer is no longer used
+      // assign value for compatibility
+      if (op->Op()->GetAttrIfExists<bool>("use_quantizer")) {
+        op->Op()->SetAttr("mkldnn_data_type", std::string("int8"));
       }
+      op->Op()->SetAttr("mkldnn_data_type", std::string("int8"));
+      op->Op()->SetAttr("use_quantizer", true);
     }
-  }
+  };
+  gpd(graph, handler);
 }
 
 }  // namespace ir
@@ -58,10 +64,7 @@ REGISTER_PASS(cpu_quantize_placement_pass,
     // a vector of operator type names to be quantized ("conv2d" etc.)
     // the second param is the default value for this vector
     .DefaultPassAttr("quantize_enabled_op_types",
-                     new std::unordered_set<std::string>(
-                         {"concat", "conv2d", "elementwise_add", "fc", "matmul",
-                          "pool2d", "prior_box", "relu", "reshape2",
-                          "transpose2"}))
+                     new std::unordered_set<std::string>())
     // a vector of operator ids that are to be excluded from quantization
     // the second param is the default value for this vector
     .DefaultPassAttr("quantize_excluded_op_ids", new std::unordered_set<int>());
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
index 008a462dc414c04f53315a8f262de15ab8fb7fb5..f3229e59d6ffb97514adb9c871d4fb981fc964e0 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
@@ -15,7 +15,10 @@ limitations under the License. */
 #pragma once
 
 #include <memory>
-#include "paddle/fluid/framework/ir/pass.h"
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 
 namespace paddle {
 namespace framework {
@@ -23,9 +26,10 @@ namespace ir {
 /*
  * Specifies which operators should be quantized.
  */
-class CPUQuantizePlacementPass : public Pass {
+class CPUQuantizePlacementPass : public FusePassBase {
  protected:
   void ApplyImpl(ir::Graph* graph) const override;
+  const std::string name_scope_{"cpu_quantize_placement_pass"};
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
index 6977a9495853f9aa9a0680cafc51a170b848bb37..761defc25ff5c89b740ccd5adff7d613beccd9d4 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
@@ -131,8 +131,8 @@ TEST(QuantizerPlacementPass, enabled_conv_excluded_one) {
 }
 
 TEST(QuantizerPlacementPass, empty_list) {
-  // no operator quantized
-  MainTest({}, {}, 0);
+  // all operators quantized
+  MainTest({}, {}, 6);
 }
 
 TEST(QuantizerPlacementPass, default_attr_value) {
diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
index 4506c162fa743a3fcb5973a9f0ebd9e8f6cdcd36..56ae02d49ef522fbf243d8dbc62ee319cbba425b 100644
--- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
@@ -81,7 +81,8 @@ void DeleteQuant(ir::Graph* graph, Scope* scope,
       if (quantized_op_type == "conv2d" ||
           quantized_op_type == "conv2d_fusion" ||
           quantized_op_type == "depthwise_conv2d" ||
-          quantized_op_type == "fc") {
+          quantized_op_type == "fc" ||
+          quantized_op_type == "conv2d_transpose") {
         op_desc->SetAttr("Input_scale", scale_value);
       } else if (quantized_op_type == "mul") {
         op_desc->SetAttr("X_scale", scale_value);
@@ -111,7 +112,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
   std::string input_name = "";
   if (quantized_op_type == "conv2d" ||
       quantized_op_type == "depthwise_conv2d" ||
-      quantized_op_type == "conv2d_fusion") {
+      quantized_op_type == "conv2d_fusion" ||
+      quantized_op_type == "conv2d_transpose") {
     weight_name = "Filter";
     input_name = "Input";
   } else if (quantized_op_type == "mul") {
@@ -122,7 +124,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
     input_name = "Input";
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
-        "QuantDequantFuse: We only support conv2d, conv2d_fusion, fc, mul for "
+        "QuantDequantFuse: We only support conv2d, conv2d_fusion, "
+        "conv2d_transpose, fc, mul for "
         "now."));
   }
   const std::string pattern_name = "dequant_fuse";
@@ -192,10 +195,12 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
         scope->Var(quantized_op_weight_node->Name())->GetMutable<LoDTensor>();
     auto w_dims = weight_tensor->dims();
     // If quantized op is fc, weight scale size = 1;
-    // If quantized op is conv, weight scale size = weight dims[0]
+    // If quantized op is conv2d, weight scale size = weight dims[0]
+    // If quantized op is conv2d_transpose, weight scale size = weight dims[1]
     bool valid_scale_size =
         (weight_scale.size() == 1 ||
-         weight_scale.size() == static_cast<size_t>(w_dims[0]));
+         weight_scale.size() == static_cast<size_t>(
+             quantized_op_type == "conv2d_transpose" ? w_dims[1] : w_dims[0]));
     PADDLE_ENFORCE_EQ(
         valid_scale_size, true,
         platform::errors::InvalidArgument(
@@ -206,8 +211,14 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
       if (weight_scale.size() == 1) {
         quantized_weight_data[j] *= weight_scale[0];
       } else {
-        int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
-        quantized_weight_data[j] *= weight_scale[j / inner_size];
+        if (quantized_op_type == "conv2d_transpose") {
+          int inner_size = w_dims[2] * w_dims[3];
+          quantized_weight_data[j] *=
+              weight_scale[(j / inner_size) % w_dims[1]];
+        } else {
+          int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
+          quantized_weight_data[j] *= weight_scale[j / inner_size];
+        }
       }
     }
 
@@ -220,7 +231,8 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
     new_op_desc.SetType(quantized_op_type);
     new_op_desc.SetAttr("enable_int8", true);
     if (quantized_op_type == "conv2d" || quantized_op_type == "conv2d_fusion" ||
-        quantized_op_type == "depthwise_conv2d") {
+        quantized_op_type == "depthwise_conv2d" ||
+        quantized_op_type == "conv2d_transpose") {
       new_op_desc.SetInput("Input", {new_input});
       new_op_desc.SetOutput("Output", {new_output});
     } else if (quantized_op_type == "fc") {
@@ -253,7 +265,7 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
   std::unordered_set<std::string> quant_types = {
       "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
   std::unordered_set<std::string> quantized_op_types = {
-      "conv2d", "mul", "depthwise_conv2d", "fc"};
+      "conv2d", "mul", "depthwise_conv2d", "fc", "conv2d_transpose"};
   auto* scope = param_scope();
 
   for (auto& quant_type : quant_types) {
diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h
index 2a85c60305bd36e78c071f5703885c23e33b403e..79b15fc87d0b0a0ade8324710b80af634ff8878f 100644
--- a/paddle/fluid/framework/op_version_registry.h
+++ b/paddle/fluid/framework/op_version_registry.h
@@ -34,7 +34,8 @@ struct OpUpdateRecord {
     kModifyAttr,
     kNewAttr,
     kNewInput,
-    kNewOutput
+    kNewOutput,
+    kBugfixWithBehaviorChanged,
   };
   Type type_;
   std::string remark_;
@@ -82,6 +83,11 @@ struct NewOutput : OpUpdateRecord {
   std::string name_;
 };
 
+struct BugfixWithBehaviorChanged : OpUpdateRecord {
+  explicit BugfixWithBehaviorChanged(const std::string& remark)
+      : OpUpdateRecord({Type::kBugfixWithBehaviorChanged, remark}) {}
+};
+
 class OpVersionDesc {
  public:
   OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark,
@@ -110,6 +116,12 @@ class OpVersionDesc {
     return *this;
   }
 
+  OpVersionDesc& BugfixWithBehaviorChanged(const std::string& remark) {
+    infos_.push_back(std::shared_ptr<OpUpdateRecord>(
+        new compatible::BugfixWithBehaviorChanged(remark)));
+    return *this;
+  }
+
  private:
   std::vector<std::shared_ptr<OpUpdateRecord>> infos_;
 };
diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc
index 052bf3a4b882be749e70704f18f09a7b24551ed7..80ad51ad07b5a84cfabb3ace9b478b1f6ea24f95 100644
--- a/paddle/fluid/framework/op_version_registry_test.cc
+++ b/paddle/fluid/framework/op_version_registry_test.cc
@@ -23,6 +23,10 @@ namespace compatible {
 
 TEST(test_operator_version, test_operator_version) {
   REGISTER_OP_VERSION(test__)
+      .AddCheckpoint(
+          R"ROC(Fix the bug of reshape op, support the case of axis < 0)ROC",
+          framework::compatible::OpVersionDesc().BugfixWithBehaviorChanged(
+              "Support the case of axis < 0"))
       .AddCheckpoint(
           R"ROC(
         Upgrade reshape, modified one attribute [axis] and add a new attribute [size].
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 3b3271fc5b936e65b60930f43ea5c4f6f8448941..c3626c5c9e0506f12ca77aac5086cb18e272a771 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -913,10 +913,20 @@ std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) {
   auto element_num = tensor.numel();
 
   os << "  - data: [";
-  if (element_num > 0) {
-    os << inspect[0];
-    for (int j = 1; j < element_num; ++j) {
-      os << " " << inspect[j];
+  // Note: int8_t and uint8_t are char typedefs; ostream prints them as chars.
+  if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) {
+    if (element_num > 0) {
+      os << signed(inspect[0]);
+      for (int j = 1; j < element_num; ++j) {
+        os << " " << signed(inspect[j]);
+      }
+    }
+  } else {
+    if (element_num > 0) {
+      os << inspect[0];
+      for (int j = 1; j < element_num; ++j) {
+        os << " " << inspect[j];
+      }
     }
   }
   os << "]";
diff --git a/paddle/fluid/imperative/backward_strategy.h b/paddle/fluid/imperative/backward_strategy.h
deleted file mode 100644
index 0f04d6db8e63d5d069745ed1895df774e69d60d0..0000000000000000000000000000000000000000
--- a/paddle/fluid/imperative/backward_strategy.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//
-// Created by Jiabin on 2019-04-25.
-//
-#pragma once
-
-namespace paddle {
-namespace imperative {
-namespace detail {
-
-struct BackwardStrategy {
-  /* DyGraph now support two kinds of backward strategy, one is sorted sum
-   * gradient, another is sum gradient once they are created */
-  // TODO(jiabin): add more Strategy when we support
-  bool sorted_sum_gradient_{false};
-};
-
-}  // namespace detail
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc
index de1246883f1019bc3e6adabadbc9e071926eb772..9ad30506b2c3a0fac16d29c3bbee07725ff3d95d 100644
--- a/paddle/fluid/imperative/basic_engine.cc
+++ b/paddle/fluid/imperative/basic_engine.cc
@@ -30,12 +30,12 @@
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/profiler.h"
 
+DECLARE_bool(sort_sum_gradient);
+
 namespace paddle {
 namespace imperative {
 
-void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy,
-                       bool retain_graph) {
-  backward_strategy_ = strategy;
+void BasicEngine::Init(VarBase* var, bool retain_graph) {
   retain_graph_ = retain_graph;
   init_node_ = var->GradVarBase()->GradNode();
   var->GradVarBase()->ClearGradNode();
@@ -105,7 +105,7 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) {
 
       auto& accumulator = accumulators_[var.get()];
       if (!accumulator) {
-        if (backward_strategy_.sorted_sum_gradient_) {
+        if (FLAGS_sort_sum_gradient) {
           accumulator.reset(new SortedGradientAccumulator(var.get()));
         } else {
           accumulator.reset(new EagerGradientAccumulator(var.get()));
diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h
index 4d25d81235098cca37491b1d8e43b481adc2fd0a..0906dd4f9236ecf26ef30395aa551b57e4e43b75 100644
--- a/paddle/fluid/imperative/basic_engine.h
+++ b/paddle/fluid/imperative/basic_engine.h
@@ -18,7 +18,6 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
-#include "paddle/fluid/imperative/backward_strategy.h"
 #include "paddle/fluid/imperative/engine.h"
 #include "paddle/fluid/imperative/gradient_accumulator.h"
 
@@ -30,8 +29,7 @@ class OpBase;
 
 class BasicEngine : public Engine {
  public:
-  void Init(VarBase* var, const detail::BackwardStrategy& strategy,
-            bool retain_graph = false);
+  void Init(VarBase* var, bool retain_graph = false);
 
   void Execute() override;
 
@@ -46,7 +44,6 @@ class BasicEngine : public Engine {
 
  private:
   std::shared_ptr<GradOpNode> init_node_;
-  detail::BackwardStrategy backward_strategy_;
   std::unordered_map<GradOpNode*, size_t> node_deps_;
   std::unordered_map<VariableWrapper*, std::unique_ptr<GradientAccumulator>>
       accumulators_;
diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc
index 4f133bf80c7904d9b6a84c933d431c2820b999e4..5c717835e5cc2042a7a3fdd8c51aa6eeff1fc523 100644
--- a/paddle/fluid/imperative/partial_grad_engine.cc
+++ b/paddle/fluid/imperative/partial_grad_engine.cc
@@ -33,6 +33,8 @@
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/string_helper.h"
 
+DECLARE_bool(sort_sum_gradient);
+
 namespace paddle {
 namespace imperative {
 
@@ -529,8 +531,7 @@ class PartialGradTask {
                   const std::vector<std::shared_ptr<VarBase>> &output_targets,
                   const std::vector<std::shared_ptr<VarBase>> &output_grads,
                   const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-                  const platform::Place &place,
-                  const detail::BackwardStrategy &strategy, bool create_graph,
+                  const platform::Place &place, bool create_graph,
                   bool retain_graph, bool allow_unused, bool only_inputs);
 
   std::vector<std::shared_ptr<VarBase>> Run();
@@ -577,7 +578,6 @@ class PartialGradTask {
   bool retain_graph_;
   bool allow_unused_;
   bool only_inputs_;
-  detail::BackwardStrategy strategy_;
 };
 
 PartialGradTask::PartialGradTask(
@@ -585,15 +585,14 @@ PartialGradTask::PartialGradTask(
     const std::vector<std::shared_ptr<VarBase>> &output_targets,
     const std::vector<std::shared_ptr<VarBase>> &output_grads,
     const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-    const platform::Place &place, const detail::BackwardStrategy &strategy,
-    bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs) {
+    const platform::Place &place, bool create_graph, bool retain_graph,
+    bool allow_unused, bool only_inputs) {
   input_targets_ = input_targets;
   place_ = place;
   create_graph_ = create_graph;
   retain_graph_ = retain_graph;
   allow_unused_ = allow_unused;
   only_inputs_ = only_inputs;
-  strategy_ = strategy;
 
   PADDLE_ENFORCE_EQ(only_inputs_, true,
                     platform::errors::Unimplemented(
@@ -981,7 +980,7 @@ void PartialGradTask::PrepareInitialGradientAccumulators(const OpBase *op) {
 
       if (!accumulator) {
         accumulator.reset(new GradientAccumulationInfo(
-            var, strategy_.sorted_sum_gradient_, create_graph_));
+            var, FLAGS_sort_sum_gradient, create_graph_));
       }
 
       accumulator->IncreaseTotalRefCnt();
@@ -1033,11 +1032,11 @@ PartialGradEngine::PartialGradEngine(
     const std::vector<std::shared_ptr<VarBase>> &output_targets,
     const std::vector<std::shared_ptr<VarBase>> &output_grads,
     const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-    const platform::Place &place, const detail::BackwardStrategy &strategy,
-    bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs)
+    const platform::Place &place, bool create_graph, bool retain_graph,
+    bool allow_unused, bool only_inputs)
     : task_(new PartialGradTask(input_targets, output_targets, output_grads,
-                                no_grad_vars, place, strategy, create_graph,
-                                retain_graph, allow_unused, only_inputs)) {}
+                                no_grad_vars, place, create_graph, retain_graph,
+                                allow_unused, only_inputs)) {}
 
 PartialGradEngine::~PartialGradEngine() { Clear(); }
 
diff --git a/paddle/fluid/imperative/partial_grad_engine.h b/paddle/fluid/imperative/partial_grad_engine.h
index a7f28c49ec3950674cd43127f51934089a497412..b5da39f8d4237130fd4674eacb479aaf6b9ba348 100644
--- a/paddle/fluid/imperative/partial_grad_engine.h
+++ b/paddle/fluid/imperative/partial_grad_engine.h
@@ -16,7 +16,6 @@
 
 #include <memory>
 #include <vector>
-#include "paddle/fluid/imperative/backward_strategy.h"
 #include "paddle/fluid/imperative/engine.h"
 #include "paddle/fluid/platform/place.h"
 
@@ -33,8 +32,7 @@ class PartialGradEngine : public Engine {
                     const std::vector<std::shared_ptr<VarBase>> &output_targets,
                     const std::vector<std::shared_ptr<VarBase>> &output_grads,
                     const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-                    const platform::Place &place,
-                    const detail::BackwardStrategy &strategy, bool create_graph,
+                    const platform::Place &place, bool create_graph,
                     bool retain_graph, bool allow_unused, bool only_inputs);
 
   ~PartialGradEngine();
diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc
index 3c3ec2e6263396881597649d3ab643b5492d630a..892acffb712d9734e525a403881fda47ca0df23a 100644
--- a/paddle/fluid/imperative/tests/test_tracer.cc
+++ b/paddle/fluid/imperative/tests/test_tracer.cc
@@ -240,9 +240,8 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) {
   framework::AttributeMap reduce_attr_map;
   tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map,
                  gpu_place, true);
-  detail::BackwardStrategy back_st;
   imperative::BasicEngine engine;
-  engine.Init(reduce_sum_out.get(), back_st);
+  engine.Init(reduce_sum_out.get());
   engine.Execute();
 
   framework::LoDTensor rlt;
@@ -356,9 +355,8 @@ TEST(test_tracer, test_var_without_grad_var) {
   ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL);
   ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL);
 
-  detail::BackwardStrategy back_st;
   imperative::BasicEngine engine;
-  engine.Init(vout.get(), back_st);
+  engine.Init(vout.get());
   engine.Execute();
 
   // check the grad
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 61886c225e6548413e6e2eb0415f596d016a988f..9fbc97d55090345af3b3b12bcd138bfaecd346cc 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -15,7 +15,6 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
@@ -103,8 +102,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
                                   // params_file_ fields.
 
   CP_MEMBER(opt_cache_dir_);
-  prog_file_ = std::move(other.prog_file_);
-  params_file_ = std::move(other.params_file_);
+  CP_MEMBER(prog_file_);
+  CP_MEMBER(params_file_);
 
   CP_MEMBER(use_fc_padding_);
   // GPU related.
@@ -218,6 +217,17 @@ void AnalysisConfig::EnableMkldnnQuantizer() {
   Update();
 }
 
+void AnalysisConfig::EnableMkldnnBfloat16() {
+#ifdef PADDLE_WITH_MKLDNN
+  use_mkldnn_bfloat16_ = true;
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnBfloat16";
+  use_mkldnn_bfloat16_ = false;
+#endif
+
+  Update();
+}
+
 MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
   PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
                           "MkldnnQuantizer was not enabled yet.");
@@ -331,6 +341,12 @@ void AnalysisConfig::Update() {
 #endif
   }
 
+  if (use_mkldnn_bfloat16_) {
+#ifdef PADDLE_WITH_MKLDNN
+    pass_builder()->EnableMkldnnBfloat16();
+#endif
+  }
+
 #ifdef PADDLE_WITH_MKLDNN
   // Do not optimize when mkldnn is on
   if (enable_memory_optim_ && !use_mkldnn_) {
@@ -399,6 +415,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << ";";
 
   ss << use_mkldnn_quantizer_;
+  ss << use_mkldnn_bfloat16_;
   ss << model_from_memory_;
 
   ss << with_profile_;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index a8c8058c6b714dcd6f283c35b50bef55446e62bb..127a41aee890808258367fb40804a9547b8fdbb0 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -32,7 +32,6 @@
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 #include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/memory/memcpy.h"
@@ -517,6 +516,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
+  // TODO(NHZlX): Should add the link to the doc of
+  // paddle_infer::CreatePredictor<paddle_infer::Config>
   if (config.glog_info_disabled()) {
     FLAGS_logtostderr = 1;
     FLAGS_minloglevel = 2;  // GLOG_ERROR
@@ -1058,3 +1059,122 @@ USE_TRT_CONVERTER(skip_layernorm);
 USE_TRT_CONVERTER(slice);
 USE_TRT_CONVERTER(scale);
 #endif
+
+namespace paddle_infer {
+
+void Tensor::Reshape(const std::vector<int> &shape) { tensor_->Reshape(shape); }
+
+std::vector<int> Tensor::shape() const { return tensor_->shape(); }
+
+void Tensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
+  tensor_->SetLoD(x);  // forwarded unchanged to the wrapped tensor
+}
+
+std::vector<std::vector<size_t>> Tensor::lod() const { return tensor_->lod(); }
+
+const std::string &Tensor::name() const { return tensor_->name(); }
+
+DataType Tensor::type() const { return tensor_->type(); }
+
+Predictor::Predictor(const Config &config) {
+  // NOTE: const_cast lets this call modify the caller's config (feed/fetch off)
+  const_cast<Config *>(&config)->SwitchUseFeedFetchOps(false);
+  predictor_ = paddle::CreatePaddlePredictor<
+      Config, paddle::PaddleEngineKind::kAnalysis>(config);
+}
+
+std::vector<std::string> Predictor::GetInputNames() {
+  return predictor_->GetInputNames();
+}
+
+std::unique_ptr<Tensor> Predictor::GetInputHandle(const std::string &name) {
+  // Wrap the predictor's input tensor for `name` in a paddle_infer::Tensor.
+  return std::unique_ptr<Tensor>(
+      new Tensor(predictor_->GetInputTensor(name)));
+}
+
+std::vector<std::string> Predictor::GetOutputNames() {
+  return predictor_->GetOutputNames();
+}
+
+std::unique_ptr<Tensor> Predictor::GetOutputHandle(const std::string &name) {
+  // Wrap the predictor's output tensor for `name` in a paddle_infer::Tensor.
+  return std::unique_ptr<Tensor>(
+      new Tensor(predictor_->GetOutputTensor(name)));
+}
+
+bool Predictor::Run() { return predictor_->ZeroCopyRun(); }
+
+std::unique_ptr<Predictor> Predictor::Clone() {
+  // Clone the wrapped predictor and give the clone to a new Predictor
+  // wrapper that the caller owns.
+  return std::unique_ptr<Predictor>(new Predictor(predictor_->Clone()));
+}
+
+void Predictor::ClearIntermediateTensor() {
+  predictor_->ClearIntermediateTensor();
+}
+
+int GetNumBytesOfDataType(DataType dtype) {
+  // Size in bytes of the C++ type matching each handled DataType.
+  if (dtype == DataType::FLOAT32) {
+    return sizeof(float);
+  } else if (dtype == DataType::INT64) {
+    return sizeof(int64_t);
+  } else if (dtype == DataType::INT32) {
+    return sizeof(int32_t);
+  } else if (dtype == DataType::UINT8) {
+    return sizeof(uint8_t);
+  }
+  // Any other DataType trips the assert; if asserts are off, -1 is returned.
+  assert(false);
+  return -1;
+}
+
+std::string GetVersion() { return paddle::get_version(); }
+
+std::string UpdateDllFlag(const char *name, const char *value) {
+  return paddle::UpdateDllFlag(name, value);
+}
+
+}  // namespace paddle_infer
+
+namespace paddle_infer {
+std::shared_ptr<Predictor> CreatePredictor(const Config &config) {  // NOLINT
+  // Build a Predictor from `config` and return it under shared ownership.
+  return std::shared_ptr<Predictor>(new Predictor(config));
+}
+
+namespace services {
+PredictorPool::PredictorPool(const Config &config, size_t size) {
+  // The pool always holds the main predictor, so a size of 0 is rejected.
+  PADDLE_ENFORCE_GE(
+      size, 1UL,
+      paddle::platform::errors::InvalidArgument(
+          "The predictor pool size should be at least 1, but it's (%d)",
+          size));
+  Config copy_config(config);  // copied before config is handed to Predictor
+  main_pred_.reset(new Predictor(config));
+  for (size_t i = 0; i < size - 1; i++) {
+    if (config.tensorrt_engine_enabled()) {
+      Config config_tmp(copy_config);
+      preds_.push_back(std::unique_ptr<Predictor>(new Predictor(config_tmp)));
+    } else {
+      preds_.push_back(main_pred_->Clone());
+    }
+  }
+}
+
+Predictor *PredictorPool::Retrive(size_t idx) {
+  PADDLE_ENFORCE_LT(
+      idx, preds_.size() + 1,
+      paddle::platform::errors::InvalidArgument(
+          "There are (%d) predictors in the pool, but the idx is (%d)",
+          preds_.size() + 1, idx));
+  if (idx == 0) {  // slot 0 is the main predictor; preds_ starts at slot 1
+    return main_pred_.get();
+  }
+  return preds_[idx - 1].get();
+}
+}  // namespace services
+}  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index dea448f9b03468eabda16d4375ea60348a09efb2..5766919f08e68832886b88b867bc48afa288a955 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -485,4 +485,25 @@ TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) {
 }
 #endif
 
+#ifdef PADDLE_WITH_CUDA
+TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
+  AnalysisConfig config;
+  config.SetModel(FLAGS_dirname);
+  config.SwitchIrOptim(true);
+  config.EnableUseGpu(100, 0);
+  config.EnableMkldnnBfloat16();
+#ifdef PADDLE_WITH_MKLDNN
+  ASSERT_EQ(config.mkldnn_bfloat16_enabled(), true);
+#else
+  ASSERT_EQ(config.mkldnn_bfloat16_enabled(), false);
+#endif
+}
+#endif
+
+TEST(AnalysisPredictor, bf16_pass_strategy) {
+  std::vector<std::string> passes;
+  PassStrategy passStrategy(passes);
+  passStrategy.EnableMkldnnBfloat16();
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 458eecfeea6ff27c96a8864ba8a08a9e5c587df5..2f608da531f25e1a5665744f7e9a2968cc9d0d64 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -112,6 +112,12 @@ void PaddleBuf::Free() {
   }
 }
 
+NativeConfig::NativeConfig() {
+  LOG(WARNING) << "The paddle::NativeConfig interface is going to be "
+                  "deprecated in the next release, please use the latest "
+                  "paddle_infer::Config instead.";
+}
+
 std::string get_version() {
   std::stringstream ss;
   ss << "version: " << framework::paddle_version() << "\n";
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 3d5b40c93dad071052217677e387ba54011fb666..07d6dcf86e9814e5bfc932d8320b549d55fe88ae 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <algorithm>
 #include <map>
+#include <memory>
 #include <set>
 #include <sstream>
 #include <string>
@@ -25,6 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -311,6 +313,8 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
+  // TODO(NHZlX): Should add the link to the doc of
+  // paddle_infer::CreatePredictor<paddle_infer::Config>
   VLOG(3) << "create NativePaddlePredictor";
   if (config.use_gpu) {
     // 1. GPU memory
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 6a31ff281c68e3675d35c14059a453455ef398df..b1244e4e3dfdd5e6a627054250e6def2a7c35a89 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -401,6 +401,19 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   void EnableMkldnnQuantizer();
 
+  ///
+  /// \brief Turn on MKLDNN bfloat16.
+  ///
+  ///
+  void EnableMkldnnBfloat16();
+
+  ///
+  /// \brief A boolean state telling whether to use the MKLDNN Bfloat16.
+  ///
+  /// \return bool Whether to use the MKLDNN Bfloat16.
+  ///
+  bool mkldnn_bfloat16_enabled() const { return use_mkldnn_bfloat16_; }
+
   ///
   /// \brief A boolean state telling whether the thread local CUDA stream is
   /// enabled.
@@ -592,6 +605,7 @@ struct PD_INFER_DECL AnalysisConfig {
   int mkldnn_cache_capacity_{0};
   bool use_mkldnn_quantizer_{false};
   std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_;
+  bool use_mkldnn_bfloat16_{false};
 
   // If the config is already used on a predictor, it becomes invalid.
   // Any config can only be used with one predictor.
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 386d20103a71acb34cd47ddf5527f580cc5bf5b1..064f63542683a0d95985382385b182d794da0068 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -347,6 +347,7 @@ class PD_INFER_DECL PaddlePredictor {
 /// place of inference, etc.)
 ///
 struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config {
+  NativeConfig();
   /// GPU related fields.
   bool use_gpu{false};
   int device{0};
@@ -421,7 +422,8 @@ enum class PaddleEngineKind {
 };
 
 template <typename ConfigT, PaddleEngineKind engine>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
+PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
+    const ConfigT& config);
 
 template <>
 PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
@@ -437,6 +439,4 @@ PD_INFER_DECL std::string get_version();
 
 PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value);
 
-PD_INFER_DECL std::shared_ptr<framework::Cipher> MakeCipher(
-    const std::string& config_file);
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 6f30ad95f168cebe9702c90fbd2cca2c79a0e83f..da5d7411693c92eaa2066c7f76d56970f8939bc7 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -22,9 +22,124 @@ limitations under the License. */
 #pragma once
 
 #include <cassert>
+#include <map>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "paddle_analysis_config.h"  // NOLINT
 #include "paddle_api.h"              // NOLINT
+
+namespace paddle_infer {
+using DataType = paddle::PaddleDType;
+using PlaceType = paddle::PaddlePlace;
+using PrecisionType = paddle::AnalysisConfig::Precision;
+using Config = paddle::AnalysisConfig;
+
+class PD_INFER_DECL Tensor {
+ public:
+  // Can only be created by predictor->GetInputHandle(const std::string& name)
+  // or predictor->GetOutputHandle(const std::string& name)
+  Tensor() = delete;
+  explicit Tensor(std::unique_ptr<paddle::ZeroCopyTensor>&& tensor)
+      : tensor_(std::move(tensor)) {}
+  void Reshape(const std::vector<int>& shape);
+
+  template <typename T>
+  void CopyFromCpu(const T* data);
+
+  // should add the place
+  template <typename T>
+  T* mutable_data(PlaceType place);
+
+  template <typename T>
+  void CopyToCpu(T* data);
+
+  template <typename T>
+  T* data(PlaceType* place, int* size) const;
+
+  void SetLoD(const std::vector<std::vector<size_t>>& x);
+  std::vector<std::vector<size_t>> lod() const;
+
+  DataType type() const;
+
+  std::vector<int> shape() const;
+  const std::string& name() const;
+
+ private:
+  std::unique_ptr<paddle::ZeroCopyTensor> tensor_;
+};
+
+class PD_INFER_DECL Predictor {
+ public:
+  Predictor() = default;
+  ~Predictor() {}
+  // Use for clone
+  explicit Predictor(std::unique_ptr<paddle::PaddlePredictor>&& pred)
+      : predictor_(std::move(pred)) {}
+
+  explicit Predictor(const Config& config);
+
+  std::vector<std::string> GetInputNames();
+  std::unique_ptr<Tensor> GetInputHandle(const std::string& name);
+
+  bool Run();
+
+  std::vector<std::string> GetOutputNames();
+  std::unique_ptr<Tensor> GetOutputHandle(const std::string& name);
+
+  std::unique_ptr<Predictor> Clone();
+  void ClearIntermediateTensor();
+
+ private:
+  std::unique_ptr<paddle::PaddlePredictor> predictor_;
+};
+
+PD_INFER_DECL std::shared_ptr<Predictor> CreatePredictor(
+    const Config& config);  // NOLINT
+PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype);
+
+PD_INFER_DECL std::string GetVersion();
+PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value);
+
+template <typename T>
+void Tensor::CopyFromCpu(const T* data) {
+  tensor_->copy_from_cpu<T>(data);
+}
+
+template <typename T>
+void Tensor::CopyToCpu(T* data) {
+  return tensor_->copy_to_cpu<T>(data);
+}
+
+template <typename T>
+T* Tensor::mutable_data(PlaceType place) {
+  return tensor_->mutable_data<T>(place);
+}
+
+template <typename T>
+T* Tensor::data(PlaceType* place, int* size) const {
+  return tensor_->data<T>(place, size);
+}
+
+}  // namespace paddle_infer
+
+namespace paddle_infer {
+namespace services {
+
+class PD_INFER_DECL PredictorPool {
+ public:
+  PredictorPool() = delete;
+  PredictorPool(const PredictorPool&) = delete;
+  PredictorPool& operator=(const PredictorPool&) = delete;
+
+  explicit PredictorPool(const Config& config, size_t size = 1);
+  Predictor* Retrive(size_t idx);
+
+ private:
+  std::shared_ptr<Predictor> main_pred_;
+  std::vector<std::unique_ptr<Predictor>> preds_;
+};
+}  // namespace services
+}  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index c07ac11e278901e9b9475492ca38411dcf8184d3..98a36a3308dc539ee5aecad9e71f50be310e584c 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -143,6 +143,10 @@ void GpuPassStrategy::EnableMkldnnQuantizer() {
   LOG(ERROR) << "GPU not support MKL-DNN quantization";
 }
 
+void GpuPassStrategy::EnableMkldnnBfloat16() {
+  LOG(ERROR) << "GPU not support MKL-DNN bfloat16";
+}
+
 CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
   // NOTE the large fusions should be located in the front, so that they will
   // not be damaged by smaller ones.
@@ -181,12 +185,14 @@ void CpuPassStrategy::EnableMKLDNN() {
     passes_.insert(passes_.begin(), "mkldnn_placement_pass");
 
     for (auto &pass : std::vector<std::string>({
-             "depthwise_conv_mkldnn_pass",    //
-             "conv_bn_fuse_pass",             // Execute BN passes again to
-             "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
-             "conv_transpose_bn_fuse_pass",   //
-             "conv_transpose_eltwiseadd_bn_fuse_pass",  //
-             "conv_bias_mkldnn_fuse_pass",              //
+             "depthwise_conv_mkldnn_pass",     //
+             "conv_bn_fuse_pass",              // Execute BN passes again to
+             "conv_eltwiseadd_bn_fuse_pass",   // preserve correct pass order
+             "conv_affine_channel_fuse_pass",  //
+             "conv_eltwiseadd_affine_channel_fuse_pass",  //
+             "conv_transpose_bn_fuse_pass",               //
+             "conv_transpose_eltwiseadd_bn_fuse_pass",    //
+             "conv_bias_mkldnn_fuse_pass",                //
              "conv_transpose_bias_mkldnn_fuse_pass",
              "conv3d_bias_mkldnn_fuse_pass",  //
              "conv_elementwise_add_mkldnn_fuse_pass",
@@ -223,4 +229,12 @@ void CpuPassStrategy::EnableMkldnnQuantizer() {
 #endif
 }
 
+void CpuPassStrategy::EnableMkldnnBfloat16() {
+#ifdef PADDLE_WITH_MKLDNN
+  use_mkldnn_bfloat16_ = true;
+#else
+  use_mkldnn_bfloat16_ = false;
+#endif
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index c5a4a5f754d031a8e8f88a96dd16c89fbe1b0fbb..9073253520466a3711089bc7b7da04a9191e0a42 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -132,6 +132,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
   /// \brief Enable MKLDNN quantize optimization.
   virtual void EnableMkldnnQuantizer() {}
 
+  /// \brief Enable MKLDNN bfloat16.
+  virtual void EnableMkldnnBfloat16() {}
+
   /// \brief Check if we are using gpu.
   /// \return A bool variable implying whether we are in gpu mode.
   bool use_gpu() const { return use_gpu_; }
@@ -161,6 +164,7 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy {
     use_gpu_ = other.use_gpu_;
     use_mkldnn_ = other.use_mkldnn_;
     use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_;
+    use_mkldnn_bfloat16_ = other.use_mkldnn_bfloat16_;
   }
   /// \brief Default destructor.
   virtual ~CpuPassStrategy() = default;
@@ -174,9 +178,13 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy {
   /// \brief Enable MKLDNN quantize optimization.
   void EnableMkldnnQuantizer() override;
 
+  /// \brief Enable MKLDNN bfloat16.
+  void EnableMkldnnBfloat16() override;
+
  protected:
   /// \cond Protected
   bool use_mkldnn_quantizer_{false};
+  bool use_mkldnn_bfloat16_{false};
   /// \endcond
 };
 
@@ -205,6 +213,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
   /// \brief Not supported in GPU mode yet.
   void EnableMkldnnQuantizer() override;
 
+  /// \brief Not supported in GPU mode yet.
+  void EnableMkldnnBfloat16() override;
+
   /// \brief Default destructor.
   virtual ~GpuPassStrategy() = default;
 
diff --git a/paddle/fluid/inference/capi/paddle_c_api.h b/paddle/fluid/inference/capi/paddle_c_api.h
index 4be6b48fb1820dc3271de164e87387c73ee67da9..32129890d02a2a0e0b357a6e0402d07b56bc6509 100644
--- a/paddle/fluid/inference/capi/paddle_c_api.h
+++ b/paddle/fluid/inference/capi/paddle_c_api.h
@@ -235,6 +235,12 @@ PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnQuantizer(
 PADDLE_CAPI_EXPORT extern bool PD_MkldnnQuantizerEnabled(
     const PD_AnalysisConfig* config);
 
+PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnBfloat16(
+    PD_AnalysisConfig* config);
+
+PADDLE_CAPI_EXPORT extern bool PD_MkldnnBfloat16Enabled(
+    const PD_AnalysisConfig* config);
+
 PADDLE_CAPI_EXPORT extern void PD_SetModelBuffer(PD_AnalysisConfig* config,
                                                  const char* prog_buffer,
                                                  size_t prog_buffer_size,
diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc
index f5445dd5a3f9b6499045361a36fd6363a79ef560..b99abc06b27ecb9686b4c6e883aaaf8b3e592415 100644
--- a/paddle/fluid/inference/capi/pd_config.cc
+++ b/paddle/fluid/inference/capi/pd_config.cc
@@ -207,6 +207,18 @@ bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) {
   return config->config.mkldnn_quantizer_enabled();
 }
 
+void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) {
+  PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
+                                      "PD_AnalysisConfig should not be null"));
+  config->config.EnableMkldnnBfloat16();
+}
+
+bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) {
+  PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
+                                      "PD_AnalysisConfig should not be null"));
+  return config->config.mkldnn_bfloat16_enabled();
+}
+
 void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer,
                        size_t prog_buffer_size, const char* params_buffer,
                        size_t params_buffer_size) {
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 97d09925b19c4911a6b412518dc58fe88da16f64..10c212c0b4fa394e3c745bf524ef9d081c4bc3c1 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -51,7 +51,13 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
 
   if (enable_int8) {
 #if IS_TRT_VERSION_GE(5000)
-    CHECK(op_desc.HasAttr("Input_scale"));
+    if (op_desc.Type() != "conv2d_transpose") {
+      PADDLE_ENFORCE_EQ(
+          op_desc.HasAttr("Input_scale"), true,
+          platform::errors::InvalidArgument("Input scale not found. TRT int8"
+                                            " requires conv/deconv to have "
+                                            "input quantization scales."));
+    }
     float in_scale =
         BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127;
     auto weight_scale =
diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
index 24cd8e0368182ae597e48765bc0167ca1eca6bd3..5cfa3d86377874d0937964339a8b60a3ebd2486f 100644
--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
@@ -54,7 +54,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
     auto ptr = new SkipLayerNormPluginDynamic(
         bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
     ptr->bias_gpu_ = bias_gpu_;
-    ptr->scale_gpu_ = bias_gpu_;
+    ptr->scale_gpu_ = scale_gpu_;
     return ptr;
   }
 
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 1b79c77c69e162a6f96a1762a4949386a7dadde4..2bd30bc05179e2881c4ecb321d76d5506233cc0e 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -192,7 +192,8 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz")
 inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc
     EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
     ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true)
-set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150) 
+
+set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY")
 
 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
@@ -514,3 +515,9 @@ if(WITH_MKLDNN)
 inference_analysis_test(test_analyzer_capi_ner SRCS analyzer_capi_ner_tester.cc 
         EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
         ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model)
+
+if(WITH_GPU)  # paddle_infer_api_test.cc includes <cuda_runtime.h>
+  inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${RESNET50_MODEL_DIR})
+endif()
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
index c60e0a25f28c01c453276a8ef04eb79b35b7dda2..da0c93d21b7852e06b6805230078540063c2b243 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
@@ -54,6 +54,9 @@ TEST(PD_AnalysisConfig, use_gpu) {
   PD_SwitchIrOptim(config, true);
   bool ir_optim = PD_IrOptim(config);
   CHECK(ir_optim) << "NO";
+  PD_EnableMkldnnBfloat16(config);
+  bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
+  CHECK(!bfloat16_enable) << "NO";
   PD_EnableTensorRtEngine(config, 1 << 20, 1, 3, Precision::kFloat32, false,
                           false);
   bool trt_enable = PD_TensorrtEngineEnabled(config);
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
index 93fcb43447d01dcafa10d8c85234d243d5095d4e..e24706691ed834ac4f49d924162035ec565d24ea 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
@@ -88,6 +88,9 @@ TEST(PD_AnalysisConfig, profile_mkldnn) {
   PD_EnableMkldnnQuantizer(config);
   bool quantizer_enable = PD_MkldnnQuantizerEnabled(config);
   CHECK(quantizer_enable) << "NO";
+  PD_EnableMkldnnBfloat16(config);
+  bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
+  CHECK(bfloat16_enable) << "NO";
   PD_SetMkldnnCacheCapacity(config, 0);
   PD_SetModel(config, prog_file.c_str(), params_file.c_str());
   PD_DeleteAnalysisConfig(config);
diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
index 5840a4c42b3b1065410dc1509cf0cee2480bd596..31701c59ec33dfced5745f7f16d8f00ffce462ef 100644
--- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
@@ -72,3 +72,59 @@ TEST(AnalysisPredictor, use_gpu) {
 
 }  // namespace inference
 }  // namespace paddle
+
+namespace paddle_infer {
+// Predictor with Lite engine on GPU; output is compared to reference values.
+TEST(Predictor, use_gpu) {
+  std::string model_dir = FLAGS_infer_model + "/" + "model";
+  Config config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.EnableLiteEngine(PrecisionType::kFloat32);
+
+  auto predictor = CreatePredictor(config);
+  const int batch = 1;
+  const int channel = 3;
+  const int height = 318;
+  const int width = 318;
+  const int input_num = batch * channel * height * width;
+  std::vector<float> input(input_num, 1);
+
+  auto input_names = predictor->GetInputNames();
+  auto input_t = predictor->GetInputHandle(input_names[0]);
+  // Feed an all-ones 1x3x318x318 float input and run inference.
+  input_t->Reshape({1, 3, 318, 318});
+  input_t->CopyFromCpu(input.data());
+  predictor->Run();
+
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputHandle(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  size_t out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                   std::multiplies<int>());
+
+  std::vector<float> out_data;
+  out_data.resize(out_num);
+  output_t->CopyToCpu(out_data.data());
+  // Reference values, one per 10th output element (see the loop below).
+  const std::vector<float> truth_values = {
+      127.780396f, 738.16656f,  1013.2264f,  -438.17206f, 366.4022f,
+      927.66187f,  736.2241f,   -633.68567f, -329.92737f, -430.15637f,
+      -633.0639f,  -146.54858f, -1324.2804f, -1349.3661f, -242.67671f,
+      117.44864f,  -801.7251f,  -391.51495f, -404.8202f,  454.16132f,
+      515.48206f,  -133.03114f, 69.293076f,  590.09753f,  -1434.6917f,
+      -1070.8903f, 307.0744f,   400.52573f,  -316.12177f, -587.1265f,
+      -161.05742f, 800.3663f,   -96.47157f,  748.708f,    868.17645f,
+      -447.9403f,  112.73656f,  1127.1992f,  47.43518f,   677.7219f,
+      593.1881f,   -336.4011f,  551.3634f,   397.82474f,  78.39835f,
+      -715.4006f,  405.96988f,  404.25684f,  246.01978f,  -8.430191f,
+      131.36617f,  -648.0528f};
+  // NOTE(review): indexing assumes out_num <= 10 * truth_values.size().
+  float* data_o = out_data.data();
+  for (size_t j = 0; j < out_num; j += 10) {
+    EXPECT_NEAR((data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0.,
+                10e-5);
+  }
+}
+
+}  // namespace paddle_infer
diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fee7c35581d3293f0036360b64961910d9eb02a7
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc
@@ -0,0 +1,95 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cuda_runtime.h>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <cstring>
+#include <numeric>
+
+#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
+
+namespace paddle_infer {
+// Smoke test: GPU inference through paddle_infer::Predictor (no value checks).
+TEST(Predictor, use_gpu) {
+  LOG(INFO) << GetVersion();
+  UpdateDllFlag("conv_workspace_size_limit", "4000");
+  std::string model_dir = FLAGS_infer_model + "/model";
+  Config config;
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.EnableUseGpu(100, 0);
+
+  auto predictor = CreatePredictor(config);
+  auto pred_clone = predictor->Clone();
+  // Element count = product of dims (functor takes values, not mutable refs).
+  std::vector<int> in_shape = {1, 3, 318, 318};
+  int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
+                               std::multiplies<int>());
+
+  std::vector<float> input(in_num, 0);
+
+  auto input_names = predictor->GetInputNames();
+  auto input_t = predictor->GetInputHandle(input_names[0]);
+
+  input_t->Reshape(in_shape);
+  input_t->CopyFromCpu(input.data());
+  predictor->Run();
+
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputHandle(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+
+  std::vector<float> out_data;
+  out_data.resize(out_num);
+  output_t->CopyToCpu(out_data.data());
+  predictor->ClearIntermediateTensor();
+}
+
+TEST(PredictorPool, basic) {
+  LOG(INFO) << GetVersion();
+  UpdateDllFlag("conv_workspace_size_limit", "4000");
+  std::string model_dir = FLAGS_infer_model + "/model";
+  Config config;
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.EnableUseGpu(100, 0);
+  // Create a pool (second arg 4, presumably its size) and use entry 2.
+  services::PredictorPool pred_pool(config, 4);
+  auto pred = pred_pool.Retrive(2);
+  // Input element count is the product of the shape dimensions.
+  std::vector<int> in_shape = {1, 3, 318, 318};
+  int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
+                               std::multiplies<int>());
+  std::vector<float> input(in_num, 0);
+
+  auto in_names = pred->GetInputNames();
+  auto input_t = pred->GetInputHandle(in_names[0]);
+  input_t->name();
+  input_t->Reshape(in_shape);
+  input_t->CopyFromCpu(input.data());
+  pred->Run();
+  auto out_names = pred->GetOutputNames();
+  auto output_t = pred->GetOutputHandle(out_names[0]);
+  auto out_type = output_t->type();
+  LOG(INFO) << GetNumBytesOfDataType(out_type);
+  if (out_type == DataType::FLOAT32) {
+    PlaceType place;
+    int size;
+    output_t->data<float>(&place, &size);
+  }
+}
+
+}  // namespace paddle_infer
diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
index 8ffa3efdf0556bd7cde7efa615f60853ad18d903..c7c7356b6e8831bc0bcd0e9ea4ad0fbdec8b6be2 100644
--- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
@@ -41,7 +41,7 @@ TEST(AnalysisPredictor, use_gpu) {
   SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
 
   std::vector<PaddleTensor> outputs;
-  for (auto& input : inputs_all) {
+  for (auto &input : inputs_all) {
     ASSERT_TRUE(predictor->Run(input, &outputs));
     predictor->ClearIntermediateTensor();
   }
@@ -49,3 +49,27 @@ TEST(AnalysisPredictor, use_gpu) {
 
 }  // namespace inference
 }  // namespace paddle
+
+namespace paddle_infer {
+TEST(PredictorPool, use_gpu) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  Config config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(model_dir);
+  config.EnableTensorRtEngine();
+  services::PredictorPool pred_pool(config, 1);
+  // Smoke test only: runs one zero-filled 1x3x224x224 input, no output checks.
+  auto predictor = pred_pool.Retrive(0);
+  auto input_names = predictor->GetInputNames();
+  auto input_t = predictor->GetInputHandle(input_names[0]);
+  std::vector<int> in_shape = {1, 3, 224, 224};
+  int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
+                               [](int a, int b) { return a * b; });
+
+  std::vector<float> input(in_num, 0);
+  input_t->Reshape(in_shape);
+  input_t->CopyFromCpu(input.data());
+  predictor->Run();
+}
+
+}  // namespace paddle_infer
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 63b3b0f1a3408154a2d1c8aff76a85a95ad044f6..81bb6881fae69b7af494449164f4fed35ade24da 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/port.h"
@@ -1231,3 +1232,24 @@ REGISTER_OP_CPU_KERNEL(
     ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                               ops::AbsGradFunctor<int64_t>>);
 /* ========================================================================== */
+
+/* ==========================  register checkpoint ===========================*/
+REGISTER_OP_VERSION(leaky_relu)
+    .AddCheckpoint(
+        R"ROC(fix leaky_relu, behavior changed when alpha < 0 or alpha > 1)ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .BugfixWithBehaviorChanged(
+                "leaky_relu calculate formula before checkpoint: out = max(x, "
+                "alpha * x); after checkpoint: out = x if x > 0 else alpha * "
+                "x"));
+
+REGISTER_OP_VERSION(hard_shrink)
+    .AddCheckpoint(
+        R"ROC(fix hard_shrink, behavior changed when threshold<0)ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .BugfixWithBehaviorChanged(
+                "hard_shrink calculate formula before checkpoint: out = x * "
+                "((x < -threshold) + (x > threshold)); after checkpoint: out = "
+                "x * (((x < -threshold) + (x > threshold)) > 0)"));
+
+/* ========================================================================== */
diff --git a/paddle/fluid/operators/bernoulli_op.cc b/paddle/fluid/operators/bernoulli_op.cc
index c525da5953d76d4406fbdd0d9d6e98619e409f71..79c4e2c2bba3191535f53e2ef2a32cd66e36230c 100644
--- a/paddle/fluid/operators/bernoulli_op.cc
+++ b/paddle/fluid/operators/bernoulli_op.cc
@@ -64,11 +64,11 @@ class BernoulliOpKernel<platform::CPUDeviceContext, T>
 
     int64_t size = x->numel();
     std::uniform_real_distribution<T> dist(0.0, 1.0);
-    auto gen_ptr = framework::Generator::GetInstance();
-    std::mt19937_64 &gen_engine = gen_ptr->GetCPUEngine();
+    auto gen_ptr = framework::DefaultCPUGenerator();
+    auto engine = gen_ptr->GetCPUEngine();
 
     for (int64_t i = 0; i < size; ++i) {
-      out_data[i] = BernoulliFunctor(in_data[i], dist(gen_engine));
+      out_data[i] = BernoulliFunctor(in_data[i], dist(*engine));
     }
   }
 };  // namespace operators
diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h
index a8485a148b17c1a084b9d294c998531ec3a8e071..03abfe7eb703b021dac2261dcd9c87d440b04001 100644
--- a/paddle/fluid/operators/clip_op.h
+++ b/paddle/fluid/operators/clip_op.h
@@ -66,7 +66,7 @@ template <typename DeviceContext, typename T>
 class ClipKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto max = static_cast<T>(context.Attr<float>("max"));
+    auto max = context.Attr<T>("max");
     Tensor max_cpu;
     if (context.HasInput("Max")) {
       auto* max_t = context.Input<Tensor>("Max");
@@ -77,9 +77,8 @@ class ClipKernel : public framework::OpKernel<T> {
       }
       max = max_data[0];
     }
-    max = static_cast<T>(max);
 
-    auto min = context.Attr<float>("min");
+    auto min = context.Attr<T>("min");
     Tensor min_cpu;
     if (context.HasInput("Min")) {
       auto* min_t = context.Input<Tensor>("Min");
@@ -90,11 +89,12 @@ class ClipKernel : public framework::OpKernel<T> {
       }
       min = min_data[0];
     }
-    min = static_cast<T>(min);
-    PADDLE_ENFORCE_LT(min, max, platform::errors::InvalidArgument(
-                                    "max should be greater than min. "
-                                    "But received min = %f, max = %f",
-                                    min, max));
+
+    PADDLE_ENFORCE_LE(min, max,
+                      platform::errors::InvalidArgument(
+                          "max should be greater than or equal to min. "
+                          "But received min = %f, max = %f",
+                          min, max));
 
     auto* x_var = context.InputVar("X");
     if (x_var->IsType<framework::LoDTensor>()) {
@@ -141,7 +141,7 @@ template <typename DeviceContext, typename T>
 class ClipGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto max = static_cast<T>(context.Attr<float>("max"));
+    auto max = context.Attr<T>("max");
     Tensor max_cpu;
     if (context.HasInput("Max")) {
       auto* max_t = context.Input<Tensor>("Max");
@@ -152,9 +152,8 @@ class ClipGradKernel : public framework::OpKernel<T> {
       }
       max = max_data[0];
     }
-    max = static_cast<T>(max);
 
-    auto min = context.Attr<float>("min");
+    auto min = context.Attr<T>("min");
     Tensor min_cpu;
     if (context.HasInput("Min")) {
       auto* min_t = context.Input<Tensor>("Min");
@@ -165,7 +164,6 @@ class ClipGradKernel : public framework::OpKernel<T> {
       }
       min = min_data[0];
     }
-    min = static_cast<T>(min);
 
     auto* d_out =
         context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc
index 74589dcb6a74c79299ef682de0bce146f33ec261..fb8cde70f5324f42fbc05fdfd65b548e0e58206a 100644
--- a/paddle/fluid/operators/controlflow/logical_op.cc
+++ b/paddle/fluid/operators/controlflow/logical_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/controlflow/logical_op.h"
+#include <algorithm>
 #include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -97,19 +99,19 @@ class BinaryLogicalOp : public LogicalOp {
     OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", comment.type);
     auto dim_x = context->GetInputDim("X");
     auto dim_y = context->GetInputDim("Y");
-
-    int product_x = framework::product(dim_x);
-    int product_y = framework::product(dim_y);
-    bool check = context->IsRuntime() || (product_x >= 0 && product_y >= 0);
-    if (check) {
-      PADDLE_ENFORCE_EQ(product_x, product_y,
-                        platform::errors::InvalidArgument(
-                            "The number of elements in X and Y should be same, "
-                            "but received %d != %d",
-                            product_x, product_y));
+    if (dim_x == dim_y) {
+      context->SetOutputDim("Out", dim_x);
+    } else {
+      int max_dim = std::max(dim_x.size(), dim_y.size());
+      int axis = std::abs(dim_x.size() - dim_y.size());
+      std::vector<int> x_dims_array(max_dim);
+      std::vector<int> y_dims_array(max_dim);
+      std::vector<int> out_dims_array(max_dim);
+      GetBroadcastDimsArrays(dim_x, dim_y, x_dims_array.data(),
+                             y_dims_array.data(), out_dims_array.data(),
+                             max_dim, axis);
+      context->SetOutputDim("Out", framework::make_ddim(out_dims_array));
     }
-
-    context->SetOutputDim("Out", context->GetInputDim("X"));
     context->ShareLoD("X", "Out");
   }
 };
diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h
index 4a83e0fda6e4ecdb1112f096eb37159337c37147..2c39201a426a25bb8595f415d80192080f1f1931 100644
--- a/paddle/fluid/operators/controlflow/logical_op.h
+++ b/paddle/fluid/operators/controlflow/logical_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <math.h>
 #include <type_traits>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
@@ -57,10 +58,8 @@ class BinaryLogicalOpKernel
     auto* y = context.Input<framework::Tensor>("Y");
     auto* out = context.Output<framework::Tensor>("Out");
     Functor binary_func;
-    platform::Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), x->data<T>(),
-          x->data<T>() + x->numel(), y->data<T>(),
-          out->mutable_data<bool>(context.GetPlace()), binary_func);
+    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, -1,
+                                                          binary_func, out);
   }
 };
 
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index 25e887ba6675e6c28bcd44c3b57c2ea571c075e3..7e0e77214c5320aa9a807fc65531f163fa7ce09e 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
 
 #ifdef PADDLE_WITH_MKLDNN
@@ -567,3 +568,14 @@ REGISTER_OP_CPU_KERNEL(
     ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
                                      double>);
+// Records the op-version checkpoint that introduced attribute output_padding.
+REGISTER_OP_VERSION(conv_transpose)
+    .AddCheckpoint(
+        R"ROC(
+      Upgrade convtranspose add a new attribute [output_padding].
+    )ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "output_padding",
+            "In order to add additional size to one side of each dimension "
+            "in the output",
+            {}));
diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
old mode 100755
new mode 100644
index 2e9db16be5530f65e0cf5d4d99ee2ad936105983..89ec1ddd12b9d8da8dba604ae4e759054212608e
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/cum_op.h"
 
 namespace paddle {
@@ -95,3 +96,14 @@ REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel<CPU, ops::CumsumFunctor<float>>,
                        ops::CumKernel<CPU, ops::CumsumFunctor<double>>,
                        ops::CumKernel<CPU, ops::CumsumFunctor<int>>,
                        ops::CumKernel<CPU, ops::CumsumFunctor<int64_t>>);
+// Records the op-version checkpoint that introduced attribute flatten.
+REGISTER_OP_VERSION(cumsum)
+    .AddCheckpoint(
+        R"ROC(
+      Upgrade cumsum add a new attribute [flatten].
+    )ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "flatten",
+            "In order to compute the cumsum over the flattened array when the "
+            "argument `axis` in python API is None.",
+            false));
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index a033611f478f9ea44fd49ab2015e78aaea6aacd9..e584e025088151cb9a6a64045387548d30a9eebf 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -56,7 +56,7 @@ endif()
 
 
 cc_test(rpc_server_test SRCS rpc_server_test.cc
-    DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op)
+    DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op scale_op)
 cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope)
 cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
 cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory)
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
index edbe945cd72bda15b506305dbfe80a3dbe085908..0983b4a406e042f094965ad9a7de437684940fa9 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
@@ -132,6 +132,15 @@ void ProcGetResponse(const VarHandle& var_h,
                             &trainer_id);
 }
 
+void ProcGetRecvResponse(const VarHandle& var_h,
+                         const ::grpc::ByteBuffer& ret_msg) {
+  VLOG(4) << "ProcGetRecvResponse";  // callback for SendAndRecv replies
+  framework::Variable* outvar = nullptr;  // out-param; value is not used here
+  int trainer_id;  // out-param; value is not used here
+  DeserializeRecvFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar,
+                                &trainer_id);
+}
+
 template <typename T>
 void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
   ::grpc::Slice slice(proto.ByteSizeLong());
@@ -482,6 +491,79 @@ VarHandlePtr GRPCClient::AsyncDistributeNotify(
   return h;
 }
 
+VarHandlePtr GRPCClient::AsyncSendAndRecv(const std::string& ep,
+                                          const platform::DeviceContext& ctx,
+                                          const framework::Scope& scope,
+                                          const std::string& send_var_name,
+                                          const std::string& recv_var_name,
+                                          const std::string& table_name,
+                                          int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string send_var_name_val = send_var_name;
+  const std::string recv_var_name_val = recv_var_name;
+  const std::string table_name_val = table_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+  const std::string method = kSendAndRecvRPC;
+  VLOG(4) << "GRPCClient::SendAndRecv Begin ,Send_var_name: "
+          << send_var_name_val << " Recv_var_name: " << recv_var_name_val;
+  int retry_times_ = 0;
+  // One SendAndRecvVariable RPC per loop iteration; failures may be retried.
+  while (true) {
+    SendAndRecvProcessor* s = new SendAndRecvProcessor(ch);
+    VarHandlePtr h(
+        new VarHandle(ep, method, send_var_name_val, p_ctx, p_scope));
+    VarHandlePtr h_recv(
+        new VarHandle(ep, method, recv_var_name_val, p_ctx, p_scope));
+    s->Prepare(h, time_out);
+    s->RecvPrepare(h_recv);
+    // NOTE(review): send_var from FindVar below is used without a null check.
+    framework::AsyncIO([send_var_name_val, recv_var_name_val, table_name_val,
+                        p_scope, p_ctx, s, method, h, this] {
+      auto* send_var = p_scope->FindVar(send_var_name_val);
+      send_var->GetMutable<framework::LoDTensor>()->set_lod({});
+      ::grpc::ByteBuffer buf;
+      VLOG(4) << "SerializeToByteBuffer: send_var_name_val: "
+              << send_var_name_val
+              << " recv_var_name_val: " << recv_var_name_val;
+      SerializeToByteBuffer(send_var_name_val, send_var, *p_ctx, &buf,
+                            recv_var_name_val, trainer_id_, table_name_val);
+
+      VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
+
+      // stub context
+      s->response_call_back_ = ProcGetRecvResponse;
+
+      platform::RecordRPCEvent record_event(method);
+      // `s` is passed to Finish() below as the tag of this unary call.
+      auto call = s->stub_g_.PrepareUnaryCall(
+          s->context_.get(), "/sendrecv.SendRecvService/SendAndRecvVariable",
+          buf, &cq_);
+      call->StartCall();
+      call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+
+      if (UNLIKELY(platform::IsProfileEnabled())) {
+        h->Wait();
+      }
+    });
+    req_count_++;
+    // With retries enabled, block on this attempt and retry if it failed.
+    if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) {
+      h->Wait();
+      if (h->should_retry) {
+        VLOG(3) << "rpc call failed, retry times " << retry_times_;
+        retry_times_++;
+        std::random_device rd;
+        std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
+        continue;
+      }
+    }
+
+    return h;
+  }
+}
+
 bool GRPCClient::Wait() {
   std::unique_lock<std::mutex> lk(sync_mutex_);
   sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); });
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h
index bd9f25567dc07381ac8f9010b8a41bbe49c50017..6b6249540c6d15954743c414a60472bf1f831151 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h
@@ -53,6 +53,8 @@ namespace distributed {
 
 void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
 
+void ProcGetRecvResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
+
 class BaseProcessor {
  public:
   BaseProcessor() { context_ = nullptr; }
@@ -131,6 +133,28 @@ class GetProcessor : public BaseProcessor {
   RequestGetCallBack response_call_back_ = ProcGetResponse;
 };
 
+class SendAndRecvProcessor : public BaseProcessor {
+ public:
+  explicit SendAndRecvProcessor(std::shared_ptr<grpc::Channel> ch)
+      : BaseProcessor(), stub_g_(ch) {}
+
+  virtual ~SendAndRecvProcessor() {}
+  // Runs the response callback on the recv handle, then marks it finished.
+  void ProcessImpl() override {
+    if (response_call_back_) {
+      response_call_back_(*var_h_recv_.get(), reply_);
+      var_h_recv_->Finish(true);
+    }
+  }
+  // Stores the handle that ProcessImpl() passes to the response callback.
+  void RecvPrepare(VarHandlePtr h_recv) { var_h_recv_ = h_recv; }
+  // NOTE(review): response_call_back_ defaults to ProcGetResponse; confirm.
+  ::grpc::ByteBuffer reply_;
+  ::grpc::GenericStub stub_g_;
+  RequestGetCallBack response_call_back_ = ProcGetResponse;
+  VarHandlePtr var_h_recv_;
+};
+
 class BatchBarrierProcessor : public BaseProcessor {
  public:
   explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
@@ -231,6 +255,14 @@ class GRPCClient : public RPCClient {
       const framework::Scope& scope, const std::string& var_name,
       int64_t time_out = FLAGS_rpc_deadline) override;
 
+  VarHandlePtr AsyncSendAndRecv(const std::string& ep,
+                                const platform::DeviceContext& ctx,
+                                const framework::Scope& scope,
+                                const std::string& send_var_name,
+                                const std::string& recv_var_name,
+                                const std::string& table_name = "",
+                                int64_t time_out = FLAGS_rpc_deadline) override;
+
   VarHandlePtr AsyncSendComplete(
       const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
 
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
index bb9719eaad0447cbc298fbd7ed9ec635ae6df58d..eddd89cf20c2eb91e88d666a6ffe4a045df7298b 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
@@ -76,7 +76,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
     PADDLE_THROW("Serialize does not support type: %s",
                  typeid(var->Type()).name());
   }
-
   std::string header;
   request.AppendToString(&header);
   auto buffer = std::unique_ptr<char[]>(new char[1024]);
@@ -101,7 +100,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   }
 #endif
   PADDLE_ENFORCE_NOT_NULL(payload);
-
   e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
                             payload->memory_size());
   if (payload->memory_size() >= std::numeric_limits<int>::max()) {
@@ -140,7 +138,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
         ::grpc::Slice::STEAL_REF);
     num_slices = 4;
   }
-
   ::grpc::ByteBuffer tmp(&slices[0], num_slices);
   msg->Swap(&tmp);
 }
@@ -156,6 +153,19 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
   *trainer_id = resp.GetTrainerId();
 }
 
+void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
+                                   const platform::DeviceContext& ctx,
+                                   const framework::Scope* scope,
+                                   framework::Variable** var, int* trainer_id) {
+  platform::RecordRPCEvent record_event("deserial");
+  operators::distributed::GRPCVariableResponse resp(scope, &ctx);
+  PADDLE_ENFORCE_EQ(
+      resp.Parse(msg), 0,  // a non-zero Parse result fails the enforce
+      platform::errors::InvalidArgument("parse bytebuffer to tensor error!"));
+  *var = resp.GetRecvVar();  // out: variable returned by GetRecvVar()
+  *trainer_id = resp.GetTrainerId();  // out: trainer id read from the response
+}
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h
index c9a57beb3a6a7a7cc9973ff0e5325a3daa6d98a9..30e6907656e25bc7bcae77d3bd02638f6bb7601d 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.h
@@ -47,6 +47,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const framework::Scope* scope,
                                framework::Variable** var, int* trainer_id);
 
+void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
+                                   const platform::DeviceContext& ctx,
+                                   const framework::Scope* scope,
+                                   framework::Variable** var, int* trainer_id);
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
index e7effcc1805f83eb16f07ceb7db53ce08983ad60..5c0232a50a9066f782be5269b4041958748c2e23 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
@@ -28,6 +28,7 @@ DECLARE_int32(rpc_retry_bind_port);
 namespace paddle {
 namespace operators {
 namespace distributed {
+
 enum CallStatus { PROCESS = 0, FINISH };
 
 // reference:
@@ -433,6 +434,51 @@ class RequestNotify final : public RequestBase {
   ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
 };
 
+class RequestSendAndRecv final : public RequestBase {
+ public:
+  explicit RequestSendAndRecv(GrpcService::AsyncService* service,
+                              ::grpc::ServerCompletionQueue* cq,
+                              RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
+    request_.reset(new GRPCVariableResponse(
+        request_handler->scope(), request_handler->dev_ctx(),
+        request_handler->distributed_mode()));
+
+    int method_id =
+        static_cast<int>(distributed::GrpcMethod::kRequestSendAndRecv);
+
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+
+  virtual ~RequestSendAndRecv() {}
+  std::string GetReqName() override { return request_->Varname(); }
+
+  void Process() override {
+    std::string in_var_name = request_->Varname();
+    std::string out_var_name = request_->OutVarname();
+    std::string table_name = request_->TableName();
+    int trainer_id = request_->GetTrainerId();
+
+    VLOG(4) << "RequestSendAndRecv, in_var_name: " << in_var_name
+            << " out_var_name: " << out_var_name << " trainer: " << trainer_id;
+    auto scope = request_->GetMutableLocalScope();
+    auto invar = scope->FindVar(in_var_name);
+    framework::Variable* outvar = nullptr;
+    request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
+                             out_var_name, table_name);
+    SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
+                          &reply_);
+    Finish(reply_, &responder_);
+  }
+
+ protected:
+  std::shared_ptr<GRPCVariableResponse> request_;
+  ::grpc::ByteBuffer reply_;
+  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
+};
+
 void AsyncGRPCServer::WaitServerReady() {
   VLOG(4) << "AsyncGRPCServer is waiting server ready";
   std::unique_lock<std::mutex> lock(this->mutex_ready_);
@@ -586,6 +632,8 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
     b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id);
   } else if (rpc_name == kRequestNotify) {
     b = new RequestNotify(service_.get(), cq.get(), handler, req_id);
+  } else if (rpc_name == kRequestSendAndRecv) {
+    b = new RequestSendAndRecv(service_.get(), cq.get(), handler, req_id);
   } else {
     PADDLE_ENFORCE(false, "not supported rpc");
   }
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h
index 45152293896e86806fe87324416c2588796558ba..95b6810ec61977b70617c9f20c2e75775157a6fb 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_service.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_service.h
@@ -85,10 +85,12 @@ enum class GrpcMethod {
   kGetMonomerVariable,
   kGetMonomerBarrier,
   kRequestNotify,
+  kRequestSendAndRecv,
+  // when you add new handler, change kGrpcNumMethods at the same time!
 };
 
 static const int kGrpcNumMethods =
-    static_cast<int>(GrpcMethod::kRequestNotify) + 1;
+    static_cast<int>(GrpcMethod::kRequestSendAndRecv) + 1;
 
 inline const char* GrpcMethodName(GrpcMethod id) {
   switch (id) {
@@ -108,6 +110,8 @@ inline const char* GrpcMethodName(GrpcMethod id) {
       return "/sendrecv.SendRecvService/CheckpointNotify";
     case GrpcMethod::kRequestNotify:
       return "/sendrecv.SendRecvService/DistributeNotify";
+    case GrpcMethod::kRequestSendAndRecv:
+      return "/sendrecv.SendRecvService/SendAndRecvVariable";
   }
 
   // Shouldn't be reached.
diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h
index 0d7032e286caab93dbd38f35881e9064694a8307..9e39e68cba779de4dc598046e45f7d35e292bb79 100644
--- a/paddle/fluid/operators/distributed/large_scale_kv.h
+++ b/paddle/fluid/operators/distributed/large_scale_kv.h
@@ -14,20 +14,19 @@
 
 #pragma once
 
+#include <ThreadPool.h>
 #include <gflags/gflags.h>
 
 #include <functional>
 #include <future>  // NOLINT
 #include <memory>
 #include <string>
+#include <thread>  // NOLINT
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
 
-#include <thread>  // NOLINT
-
-#include <ThreadPool.h>
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/rw_lock.h"
@@ -89,26 +88,17 @@ class UniformInitializer : public Initializer {
     min_ = std::stof(attrs[2]);
     max_ = std::stof(attrs[3]);
 
-    if (seed_ == 0) {
-      seed_ = std::random_device()();
-    }
-
-    random_engine_.seed(seed_);
     dist_ = std::uniform_real_distribution<float>(min_, max_);
+    random_engine_ = framework::GetCPURandomEngine(seed_);
   }
 
-  float GetValue() override {
-    return framework::Generator::GetInstance()->is_init_py
-               ? dist_(framework::Generator::GetInstance()->GetCPUEngine())
-               : dist_(random_engine_);
-    // return dist_(random_engine_);
-  }
+  float GetValue() override { return dist_(*random_engine_); }
 
  private:
   float min_;
   float max_;
 
-  std::minstd_rand random_engine_;
+  std::shared_ptr<std::mt19937_64> random_engine_;
   std::uniform_real_distribution<float> dist_;
 };
 
@@ -139,26 +129,18 @@ class GaussianInitializer : public Initializer {
     mean_ = std::stof(attrs[2]);
     std_ = std::stof(attrs[3]);
 
-    if (seed_ == 0) {
-      seed_ = std::random_device()();
-    }
+    random_engine_ = framework::GetCPURandomEngine(seed_);
 
-    random_engine_.seed(seed_);
     dist_ = std::normal_distribution<float>(mean_, std_);
   }
 
-  float GetValue() override {
-    return framework::Generator::GetInstance()->is_init_py
-               ? dist_(framework::Generator::GetInstance()->GetCPUEngine())
-               : dist_(random_engine_);
-    // return dist_(random_engine_);
-  }
+  float GetValue() override { return dist_(*random_engine_); }
 
  private:
   float std_;
   float mean_;
 
-  std::minstd_rand random_engine_;
+  std::shared_ptr<std::mt19937_64> random_engine_;
   std::normal_distribution<float> dist_;
 };
 
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 59531c0ec78ed8f0ec60a94d48069685e5b8c1a2..44359af1b1b2a6a161adcc83b97ea5fad96eecb0 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -46,6 +46,7 @@ constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
 constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
 constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier";
 constexpr char kRequestNotify[] = "RequestNotify";
+constexpr char kRequestSendAndRecv[] = "RequestSendAndRecv";
 
 constexpr char kSendRPC[] = "SendRPC";
 constexpr char kGetRPC[] = "GetRPC";
@@ -57,6 +58,7 @@ constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC";
 constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC";
 constexpr char kSendCompleteRPC[] = "SendCompleteRPC";
 constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC";
+constexpr char kSendAndRecvRPC[] = "SendAndRecvRPC";
 constexpr int64_t kPrefetchTimeout = 60000;
 
 #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index e99b0ed4072645fcbc3ef4ce8728fc0f9cd912a3..761a4edc523da52ffdbdd2039444c133e8da368c 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -325,6 +325,22 @@ bool RequestNotifyHandler::Handle(const std::string &varname,
   return true;
 }
 
+bool RequestSendAndRecvHandler::Handle(const std::string &varname,
+                                       framework::Scope *Scope,
+                                       framework::Variable *var,
+                                       framework::Variable **outvar,
+                                       const int trainer_id,
+                                       const std::string &out_var_name,
+                                       const std::string &table_name) {
+  VLOG(3) << "SendAndRecvHandle: " << varname << " out_var_name: "
+          << out_var_name << " , trainer_id:  " << trainer_id;
+  // at() throws on an unknown varname; operator[] would insert a null ctx.
+  executor_->RunPreparedContext(grad_to_prepared_ctx_->at(varname).get(),
+                                Scope);
+  *outvar = Scope->FindVar(out_var_name);
+  return true;
+}
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h
index f22a133c2d5b1196a672f978d76d1c362f616bf6..42621724e68f40617bebd2b01e2af5dd23387163 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.h
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h
@@ -176,6 +176,17 @@ class RequestNotifyHandler final : public RequestHandler {
   std::unordered_map<int, int64_t> decay_counters;
 };
 
+class RequestSendAndRecvHandler final : public RequestHandler {
+ public:
+  explicit RequestSendAndRecvHandler(int distributed_mode)
+      : RequestHandler(distributed_mode) {}
+  virtual ~RequestSendAndRecvHandler() {}
+  bool Handle(const std::string& varname, framework::Scope* Scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override;
+};
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
index 62313222775c662b78bfab5827cd5b418a2a0997..69a5e3274318337f5424afa6492da829e04daa69 100644
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -85,6 +85,12 @@ class RPCClient {
       const framework::Scope& scope, const std::string& var_name,
       int64_t time_out = FLAGS_rpc_deadline) = 0;
 
+  virtual VarHandlePtr AsyncSendAndRecv(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& send_var_name,
+      const std::string& recv_var_name, const std::string& table_name = "",
+      int64_t time_out = FLAGS_rpc_deadline) = 0;
+
   virtual VarHandlePtr AsyncSendComplete(
       const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
 
diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc
index 67e11120b808e26df590440389c71f3340738082..5ce7ac85269572ea7d2b6a015bb6c9d106f8199e 100644
--- a/paddle/fluid/operators/distributed/rpc_server_test.cc
+++ b/paddle/fluid/operators/distributed/rpc_server_test.cc
@@ -35,27 +35,24 @@ namespace platform = paddle::platform;
 namespace distributed = paddle::operators::distributed;
 
 USE_NO_KERNEL_OP(lookup_sparse_table_read);
+USE_OP(scale);
 
 std::unique_ptr<distributed::RPCServer> g_rpc_service;
 std::unique_ptr<distributed::RequestHandler> g_req_handler;
 
-framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
+framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
   auto root_block = program->MutableBlock(0);
   auto* block = program->AppendBlock(*root_block);
 
-  framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}});
-  framework::VariableNameMap output({{"Output", {"out"}}});
-  auto op = block->AppendOp();
-  op->SetType("lookup_sparse_table_read");
-  op->SetInput("W", {"w"});
-  op->SetInput("Ids", {"ids"});
-  op->SetOutput("Out", {"out"});
-  op->SetAttr("tablename", {"w"});
-  op->SetAttr("value_names", {"Param"});
-
-  auto& out = *root_block->Var("out");
+  framework::OpDesc* op = block->AppendOp();
+  op->SetType("scale");
+  op->SetInput("X", {"x"});
+  op->SetOutput("Out", {"res"});
+  op->SetAttr("scale", 0.5f);
+
+  auto& out = *root_block->Var("res");
   out.SetType(framework::proto::VarType::LOD_TENSOR);
-  out.SetShape({10, 10});
+  out.SetShape({1, 10});
 
   return block;
 }
@@ -69,6 +66,12 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
 
   auto ids_var = scope->Var("ids");
   ids_var->GetMutable<framework::LoDTensor>();
+
+  auto x_var = scope->Var("x");
+  x_var->GetMutable<framework::LoDTensor>();
+
+  auto res_var = scope->Var("res");
+  res_var->GetMutable<framework::LoDTensor>();
 }
 
 void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
@@ -78,6 +81,11 @@ void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
   int64_t* ids_ptr =
       ids_var->mutable_data<int64_t>(framework::DDim({rows_numel, 1}), *place);
   for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2;
+
+  auto x_var = scope->Var("x")->GetMutable<framework::LoDTensor>();
+  float* x_ptr =
+      x_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
+  for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0;
 }
 
 void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
@@ -124,6 +132,38 @@ void StartServer(const std::string& rpc_name) {
   server_thread.join();
 }
 
+void StartSendAndRecvServer(const std::string& rpc_name) {
+  framework::ProgramDesc program;
+  framework::Scope scope;
+  platform::CPUPlace place;
+  framework::Executor exe(place);
+  platform::CPUDeviceContext ctx(place);
+  auto block = AppendSendAndRecvBlock(&program);
+  std::string in_var_name("x");
+  std::vector<int> prefetch_block_ids{block->ID()};
+  auto prepared = exe.Prepare(program, prefetch_block_ids);
+  InitTensorsOnServer(&scope, &place, 10);
+
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      grad_to_prepared_ctx;
+  grad_to_prepared_ctx[in_var_name] = prepared[0];
+
+  g_req_handler->SetProgram(&program);
+  g_req_handler->SetGradToPreparedCtx(&grad_to_prepared_ctx);
+  g_req_handler->SetDevCtx(&ctx);
+  g_req_handler->SetScope(&scope);
+  g_req_handler->SetExecutor(&exe);
+
+  g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get());
+  g_req_handler->SetRPCServer(g_rpc_service.get());
+
+  std::thread server_thread(
+      std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
+
+  server_thread.join();
+}
+
 TEST(COMPLETE, CPU) {
   setenv("http_proxy", "", 1);
   setenv("https_proxy", "", 1);
@@ -147,3 +187,46 @@ TEST(COMPLETE, CPU) {
   g_rpc_service.reset(nullptr);
   g_req_handler.reset(nullptr);
 }
+
+TEST(SENDANDRECV, CPU) {
+  setenv("http_proxy", "", 1);
+  setenv("https_proxy", "", 1);
+  g_req_handler.reset(new distributed::RequestSendAndRecvHandler(
+      distributed::DistributedMode::kAsync));
+  g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
+  distributed::RPCClient* client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
+  PADDLE_ENFORCE_NE(client, nullptr,
+                    platform::errors::InvalidArgument(
+                        "Client Start Fail, Check Your Code & Env"));
+  std::thread server_thread(StartSendAndRecvServer,
+                            distributed::kRequestSendAndRecv);
+  g_rpc_service->WaitServerReady();
+  int port = g_rpc_service->GetSelectedPort();
+  std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
+
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+
+  // create var on local scope
+  int64_t rows_numel = 10;
+  InitTensorsOnClient(&scope, &place, rows_numel);
+  std::string in_var_name("x");
+  std::string out_var_name("res");
+
+  client->AsyncSendAndRecv(ep, ctx, scope, in_var_name, out_var_name);
+  client->Wait();
+  auto var = scope.Var(out_var_name);
+  auto value = var->GetMutable<framework::LoDTensor>();
+  auto ptr = value->mutable_data<float>(place);
+
+  for (int64_t i = 0; i < rows_numel; ++i) {
+    EXPECT_EQ(ptr[i], 0.5);
+  }
+  g_rpc_service->ShutDown();
+  server_thread.join();
+  LOG(INFO) << "begin reset";
+  g_rpc_service.reset(nullptr);
+  g_req_handler.reset(nullptr);
+}
diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in
index 0337b72181cf9f612fe56ae24bad39775bfcde28..a333642bd16fbfbe648a835101d67218bf473cdb 100644
--- a/paddle/fluid/operators/distributed/send_recv.proto.in
+++ b/paddle/fluid/operators/distributed/send_recv.proto.in
@@ -29,7 +29,7 @@ service SendRecvService {
 
   rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
   rpc DistributeNotify(VariableMessage) returns (VoidMessage) {}
-
+  rpc SendAndRecvVariable(VariableMessage) returns (VariableMessage) {}
   rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {}
   rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
 }
diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h
index 3cabcd22cd52222aff2555a8449e558de2c287c0..d979cd8a881ec7d697eae06b4911d597730b6908 100644
--- a/paddle/fluid/operators/distributed/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
@@ -96,6 +96,13 @@ class VariableResponse {
     return scope_->FindVar(meta_.varname());
   }
 
+  framework::Variable* GetRecvVar() {
+    if (create_scope_) {
+      return local_scope_->Var(meta_.out_varname());
+    }
+    return scope_->FindVar(meta_.out_varname());
+  }
+
   int GetTrainerId() { return static_cast<int>(meta_.trainer_id()); }
 
  protected:
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
index 5869407be5a5750d3948f87fe8743adf0a425422..5e1e408eb2c28239fded0d0cf037c94783828b50 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -268,7 +268,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
                     "server program should have at least 2 blocks");
-
   std::vector<int> block_list;
   for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
     block_list.push_back(blkid);
@@ -295,6 +294,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
   request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
   request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
   request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
+  request_send_and_recv_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
 
   while (true) {
     if (rpc_service_->IsExit()) {
@@ -394,6 +394,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
       new distributed::RequestGetNoBarrierHandler());
   request_notify_handler_.reset(
       new distributed::RequestNotifyHandler(distributed_mode, fan_in));
+  request_send_and_recv_handler_.reset(
+      new distributed::RequestSendAndRecvHandler(distributed_mode));
 
   rpc_service_->RegisterRPC(distributed::kRequestSend,
                             request_send_handler_.get(), rpc_send_thread_num);
@@ -408,6 +410,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
                             request_get_no_barrier_handler_.get());
   rpc_service_->RegisterRPC(distributed::kRequestNotify,
                             request_notify_handler_.get(), rpc_send_thread_num);
+  rpc_service_->RegisterRPC(distributed::kRequestSendAndRecv,
+                            request_send_and_recv_handler_.get(),
+                            rpc_get_thread_num);
 
   auto optimize_blocks =
       Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
@@ -416,6 +421,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
                         "optimize blocks is less than 1. Optimize blocks "
                         "should be 1 at least on the pserver side."));
   auto *program = optimize_blocks[0]->Program();
+
   framework::Executor executor(dev_place);
 
   std::shared_ptr<framework::ExecutorPrepareContext> ckpt_pre_context = nullptr;
@@ -488,6 +494,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   f(request_checkpoint_handler_.get());
   f(request_get_no_barrier_handler_.get());
   f(request_notify_handler_.get());
+  f(request_send_and_recv_handler_.get());
 
   // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
   signal(SIGINT, SignalHandler::StopAndExit);
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
index 369743dfb2392c029bc3b671e519aefbbdd2b6b7..b41e4e87722f638e6661a5116ebdfbc02c32710f 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
@@ -99,6 +99,8 @@ class ListenAndServOp : public framework::OperatorBase {
   mutable std::shared_ptr<distributed::RequestHandler>
       request_checkpoint_handler_;
   mutable std::shared_ptr<distributed::RequestHandler> request_notify_handler_;
+  mutable std::shared_ptr<distributed::RequestHandler>
+      request_send_and_recv_handler_;
 
   mutable std::shared_ptr<std::thread> server_thread_;
   mutable std::vector<std::string> sparse_vars_;
diff --git a/paddle/fluid/operators/distributed_ops/send_and_recv_op.cc b/paddle/fluid/operators/distributed_ops/send_and_recv_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..00cdbe70ca47e6e0bba8294b3b81c804b096339c
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/send_and_recv_op.cc
@@ -0,0 +1,98 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <future>  // NOLINT
+#include <ostream>
+
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/distributed/communicator.h"
+#include "paddle/fluid/operators/distributed/communicator_common.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
+#include "paddle/fluid/operators/distributed/parameter_send.h"
+#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SendAndRecvKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& scope = ctx.scope();
+    const auto& place = ctx.GetPlace();
+    auto send_var_name = ctx.Attr<std::string>("send_var_name");
+    auto recv_var_name = ctx.Attr<std::string>("recv_var_name");
+    auto epmap = ctx.Attr<std::string>("endpoint");
+    auto trainer_id = ctx.Attr<int>("trainer_id");
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& context = *pool.Get(place);
+
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
+    VLOG(3) << "SendAndRecvOp Send_var_name: " << send_var_name
+            << " Recv_var_name: " << recv_var_name;
+    distributed::VarHandlePtr rets = rpc_client->AsyncSendAndRecv(
+        epmap, context, scope, send_var_name, recv_var_name);
+    rets->Wait();
+  }
+};
+
+class SendAndRecvOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+    return framework::OpKernelType(data_type, platform::CPUPlace());
+  }
+};
+
+class SendAndRecvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Tensor Input variable to be sent").AsDuplicable();
+    AddOutput("Out", "Tensor Output variable to be recv").AsDuplicable();
+    AddAttr<std::string>("send_var_name", "Send Tensor's name")
+        .SetDefault(std::string(""));
+    AddAttr<std::string>("recv_var_name", "Recv Tensor's name")
+        .SetDefault(std::string(""));
+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
+    AddAttr<std::string>("endpoint", "Server endpoint")
+        .SetDefault({"127.0.0.1:6164"});
+    AddComment(R"DOC(
+    SendAndRecv operator
+    This operator will send variables to listen_and_serve op at the parameter server.
+    And recv variable from parameter server of send variable's scope.
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(send_and_recv, ops::SendAndRecvOp, ops::SendAndRecvOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    send_and_recv,
+    ops::SendAndRecvKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
index bce4c7ca19a603fd2eadaff7f82b5cdec91bb79f..9d9eb4a82a075f27764a73d0e976dbf3f7181cb1 100644
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -55,30 +55,22 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
         std::memset(mask_data, 0, size * sizeof(*mask_data));  // NOLINT
         return;
       }
-
-      bool init_generator_py = framework::Generator::GetInstance()->is_init_py;
-
+      // std::minstd_rand engine;
       // NOTE: fixed seed should only be used in unittest or for debug.
       // Guarantee to use random seed in training.
-      std::random_device rnd;
-      std::minstd_rand engine;
-      int seed_data;
+      int seed_data = 0;
       if (seed) {
         seed_data = *(seed->data<int>());
       } else {
         seed_data =
-            context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
+            context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : 0;
       }
-      engine.seed(seed_data);
+      auto engine = framework::GetCPURandomEngine(seed_data);
 
       std::uniform_real_distribution<float> dist(0, 1);
 
       for (size_t i = 0; i < size; ++i) {
-        float cur_random =
-            init_generator_py
-                ? dist(framework::Generator::GetInstance()->GetCPUEngine())
-                : dist(engine);
-        if (cur_random < dropout_prob) {
+        if (dist(*engine) < dropout_prob) {
           mask_data[i] = 0;
           y_data[i] = 0;
         } else {
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
index a5909aad99a82529a0739cd28b1b72a146524f76..8afe2133c0488bbe04ec4803aac5dce6573f634d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
@@ -26,14 +26,34 @@ namespace operators {
 template <typename T>
 struct FloorDivFunctor {
   inline HOSTDEVICE T operator()(T a, T b) const {
-    return static_cast<T>(floor(a / b));
+#ifdef __CUDA_ARCH__
+    if (b == 0) {
+      printf("Error: Divide by zero encounter in floor_divide\n");
+      asm("trap;");
+    }
+#else
+    if (b == 0)
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Divide by zero encounter in floor_divide"));
+#endif
+    return static_cast<T>(std::trunc(a / b));
   }
 };
 
 template <typename T>
 struct InverseFloorDivFunctor {
   inline HOSTDEVICE T operator()(T a, T b) const {
-    return static_cast<T>(floor(b / a));
+#ifdef __CUDA_ARCH__
+    if (a == 0) {
+      printf("Error: Divide by zero encounter in floor_divide\n");
+      asm("trap;");
+    }
+#else
+    if (a == 0)
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Divide by zero encounter in floor_divide"));
+#endif
+    return static_cast<T>(std::trunc(b / a));
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
index 4306a471b76c5bd4f0a5284052d7d39aa5fbc279..47bd6af0b95ace2b9b753e38cfc5f191bc1bb942 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
@@ -24,13 +24,19 @@ namespace operators {
 
 template <typename T>
 struct ModFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a % b; }
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    T res = a % b;
+    if ((res != 0) && ((res < 0) != (b < 0))) res += b;
+    return res;
+  }
 };
 
 template <typename T>
 struct ModFunctorFP {
   inline HOSTDEVICE T operator()(T a, T b) const {
-    return fmod(b + fmod(a, b), b);
+    T res = fmod(a, b);
+    if ((res != 0) && ((b < 0) != (res < 0))) res += b;
+    return res;
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.h b/paddle/fluid/operators/elementwise/elementwise_pow_op.h
old mode 100644
new mode 100755
index ff55d2f2040a17c32720df08c1ac0b00cc1d7a02..a910c326196bc61758c3be7db3b8ac5d85b0095c
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.h
@@ -22,15 +22,20 @@ namespace operators {
 template <typename T>
 struct PowFunctor {
   inline HOSTDEVICE T operator()(T a, T b) const {
-#ifdef __CUDA_ARCH__
-    // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
-    // it will return a float number like 2.99... , which floor to 2
-    // when cast to int by default and it is wrong.
-    // Use llrint to cast it to the nearest integer, which is 3.
+    // TODO(wujionghao): A potential speed improvement is supporting different
+    // types in C++.
+    // #ifdef __CUDA_ARCH__
+    //     // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
+    //     // it will return a float number like 2.99... , which floor to 2
+    //     // when cast to int by default and it is wrong.
+    //     // Use llrint to cast it to the nearest integer, which is 3.
+    //     if (std::is_integral<T>::value) {
+    //       return std::llrint(std::pow(a, b));
+    //     }
+    // #endif
     if (std::is_integral<T>::value) {
       return std::llrint(std::pow(a, b));
     }
-#endif
     return std::pow(a, b);
   }
 };
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 8a3450d1df97a2e99711f9ae029ca2668f38e2b0..28afeb6f541c68fe7e0719a782fd8c9147b15163 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
-
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace operators {
 
@@ -152,3 +152,7 @@ REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
                        ops::GatherGradientOpKernel<int>,
                        ops::GatherGradientOpKernel<uint8_t>,
                        ops::GatherGradientOpKernel<int64_t>);
+REGISTER_OP_VERSION(gather)
+    .AddCheckpoint(R"ROC(upgrad gather, add attribut [axis])ROC",
+                   paddle::framework::compatible::OpVersionDesc().NewAttr(
+                       "axis", "Specify the axis of gather operation.", {}));
diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc
index 111d4ad4490074fb53671f6f3180cf17c5abe913..4f128463375b91803a7a4d02a27dd78157961aac 100644
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
@@ -39,26 +39,14 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> {
     tensor->Resize(shape);
     int64_t size = tensor->numel();
     T* data = tensor->mutable_data<T>(context.GetPlace());
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    auto engine = framework::GetCPURandomEngine(seed);
 
-    if (framework::Generator::GetInstance()->is_init_py) {
-      std::mt19937_64& gen_engine =
-          framework::Generator::GetInstance()->GetCPUEngine();
-      for (int64_t i = 0; i < size; ++i) {
-        data[i] = dist(gen_engine);
-      }
-    } else {
-      unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-      std::minstd_rand engine;
-      if (seed == 0) {
-        seed = std::random_device()();
-      }
-      engine.seed(seed);
-      for (int64_t i = 0; i < size; ++i) {
-        data[i] = dist(engine);
-      }
+    for (int64_t i = 0; i < size; ++i) {
+      data[i] = dist(*engine);
     }
   }
-};
+};  // class CPUGaussianRandomKernel
 
 template <typename T>
 class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12733a0d9f1689a020f77d23cc31b0d19b412746
--- /dev/null
+++ b/paddle/fluid/operators/interpolate_v2_op.cc
@@ -0,0 +1,695 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/operators/interpolate_v2_op.h"
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using DataLayout = framework::DataLayout;
+
+// Shape inference for linear interpolation of a 3-D input. With SizeTensor
+// the width is Attr(out_w); with OutSize at runtime only OutSize is checked
+// and no output dim is set here; otherwise the width comes from Attr(scale)
+// or Attr(out_w), or is -1 when Input(Scale) is present.
+static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) {
+  auto dim_x = ctx->GetInputDim("X");
+  auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
+
+  PADDLE_ENFORCE_EQ("linear", interp_method,
+                    platform::errors::InvalidArgument(
+                        "Interpolation method can only be \"linear\" when "
+                        "Input(X) dimension is 3, but got method = %s .",
+                        interp_method));
+  const DataLayout data_layout = framework::StringToDataLayout(
+      ctx->Attrs().Get<std::string>("data_layout"));
+
+  if (ctx->HasInputs("SizeTensor")) {
+    // top priority size
+    auto inputs_name = ctx->Inputs("SizeTensor");
+    PADDLE_ENFORCE_EQ(
+        inputs_name.size(), 1,
+        platform::errors::InvalidArgument(
+            "Input(SizeTensor)'size of Op(interpolate) must be 1. "
+            "Attr(out_shape)'s length must be 1 for 3-D input tensor, but got "
+            "size = %d .",
+            inputs_name.size()));
+    int out_w = ctx->Attrs().Get<int>("out_w");
+    framework::DDim dim_out;
+    if (data_layout == DataLayout::kNCHW) {
+      dim_out = {dim_x[0], dim_x[1], out_w};
+    } else {
+      dim_out = {dim_x[0], out_w, dim_x[2]};
+    }
+    ctx->SetOutputDim("Out", dim_out);
+
+    return;
+  }
+
+  int out_w = -1;  // -1: unknown until runtime (kept if Input(Scale) is set)
+  if (ctx->HasInput("Scale")) {
+    auto scale_tensor = ctx->GetInputDim("Scale");
+    PADDLE_ENFORCE_EQ(
+        scale_tensor.size(), 1,
+        platform::errors::InvalidArgument(
+            "Scale's dimension size must be 1, but got dimension = %d .",
+            scale_tensor.size()));
+    PADDLE_ENFORCE_EQ(
+        scale_tensor[0], 1,
+        platform::errors::InvalidArgument(
+            "Scale's shape must be 1, but got shape = %d .", scale_tensor[0]));
+  } else {
+    auto scale = ctx->Attrs().Get<std::vector<float>>("scale");
+    if (scale.size() > 0) {
+      const float scale_w = scale[0];
+      PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                               "scale  of Op(interpolate) "
+                                               "should be greater than 0."));
+      // round down
+      out_w = (data_layout == DataLayout::kNCHW
+                   ? static_cast<int>(dim_x[2] * scale_w)
+                   : static_cast<int>(dim_x[1] * scale_w));
+      // protect when input shape is -1
+      out_w = out_w > 0 ? out_w : -1;
+    } else {
+      out_w = ctx->Attrs().Get<int>("out_w");
+    }
+  }
+
+  if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
+    auto out_size_dim = ctx->GetInputDim("OutSize");
+    PADDLE_ENFORCE_EQ(
+        out_size_dim.size(), 1,
+        platform::errors::InvalidArgument(
+            "OutSize's dimension size must be 1, but got dimension = %d .",
+            out_size_dim.size()));
+    PADDLE_ENFORCE_EQ(out_size_dim[0], 1, platform::errors::InvalidArgument(
+                                              "OutSize's dim[0] must be 1"));
+    ctx->ShareLoD("X", "Out");
+    return;
+  }
+
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {dim_x[0], dim_x[1], out_w};
+  } else {
+    dim_out = {dim_x[0], out_w, dim_x[2]};
+  }
+  ctx->SetOutputDim("Out", dim_out);
+}
+
+// Shape inference for bilinear/nearest/bicubic interpolation of a 4-D input.
+// Out dims come from Attr(out_h/out_w) or Attr(scale); they are -1 when
+// Input(Scale) is present and are not set here for a runtime Input(OutSize).
+static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
+  auto dim_x = ctx->GetInputDim("X");
+  auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
+
+  PADDLE_ENFORCE(
+      "bilinear" == interp_method || "nearest" == interp_method ||
+          "bicubic" == interp_method,
+      platform::errors::InvalidArgument(
+          "Interpolation method can only be \"bilinear\", \"nearest\" or "
+          "\"bicubic\" when Input(X) dimension is 4, but got method = %s .",
+          interp_method));
+  const DataLayout data_layout = framework::StringToDataLayout(
+      ctx->Attrs().Get<std::string>("data_layout"));
+
+  if (ctx->HasInputs("SizeTensor")) {
+    // top priority size
+    auto inputs_name = ctx->Inputs("SizeTensor");
+    PADDLE_ENFORCE_EQ(
+        inputs_name.size(), 2,
+        platform::errors::InvalidArgument(
+            "Input(SizeTensor)'size of Op(interpolate) must be 2. "
+            "Attr(out_shape)'s length must be 2 for 4-D input "
+            "tensor, but got size = %d .",
+            inputs_name.size()));
+    int out_h = ctx->Attrs().Get<int>("out_h");
+    int out_w = ctx->Attrs().Get<int>("out_w");
+    framework::DDim dim_out;
+    if (data_layout == DataLayout::kNCHW) {
+      dim_out = {dim_x[0], dim_x[1], out_h, out_w};
+    } else {
+      dim_out = {dim_x[0], out_h, out_w, dim_x[3]};
+    }
+    ctx->SetOutputDim("Out", dim_out);
+    return;
+  }
+
+  int out_h = -1, out_w = -1;  // -1: unknown until runtime (Input(Scale))
+  if (ctx->HasInput("Scale")) {
+    auto scale_tensor = ctx->GetInputDim("Scale");
+    PADDLE_ENFORCE_EQ(
+        scale_tensor.size(), 1,
+        platform::errors::InvalidArgument(
+            "Scale's dimension size must be 1, but got dimension = %d .",
+            scale_tensor.size()));
+    PADDLE_ENFORCE_EQ(scale_tensor[0] == 2 || scale_tensor[0] == 1, true,
+                      platform::errors::InvalidArgument(
+                          "Scale's shape must be 2 or 1, but got shape = %d .",
+                          scale_tensor[0]));
+  } else {
+    auto scale = ctx->Attrs().Get<std::vector<float>>("scale");
+    if (scale.size() > 0) {
+      PADDLE_ENFORCE_EQ(scale.size(), 2, platform::errors::InvalidArgument(
+          "Attr(scale) must have 2 elements for 4-D input, but got %d .",
+          scale.size()));
+      const float scale_h = scale[0];
+      const float scale_w = scale[1];
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+      // round down
+      out_h = (data_layout == DataLayout::kNCHW
+                   ? static_cast<int>(dim_x[2] * scale_h)
+                   : static_cast<int>(dim_x[1] * scale_h));
+      out_w = (data_layout == DataLayout::kNCHW
+                   ? static_cast<int>(dim_x[3] * scale_w)
+                   : static_cast<int>(dim_x[2] * scale_w));
+      // protect when input shape is -1
+      out_h = out_h > 0 ? out_h : -1;
+      out_w = out_w > 0 ? out_w : -1;
+    } else {
+      out_h = ctx->Attrs().Get<int>("out_h");
+      out_w = ctx->Attrs().Get<int>("out_w");
+    }
+  }
+
+  if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
+    auto out_size_dim = ctx->GetInputDim("OutSize");
+    PADDLE_ENFORCE_EQ(
+        out_size_dim.size(), 1,
+        platform::errors::InvalidArgument(
+            "OutSize's dimension size must be 1, but got dimension = %d .",
+            out_size_dim.size()));
+    PADDLE_ENFORCE_EQ(
+        out_size_dim[0], 2,
+        platform::errors::InvalidArgument(
+            "OutSize's dim[0] must be 2, but got dimension = %d .",
+            out_size_dim[0]));
+    ctx->ShareLoD("X", "Out");
+    return;
+  }
+
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {dim_x[0], dim_x[1], out_h, out_w};
+  } else {
+    dim_out = {dim_x[0], out_h, out_w, dim_x[3]};
+  }
+  ctx->SetOutputDim("Out", dim_out);
+}
+
+// Shape inference for trilinear interpolation of a 5-D input. Out dims come
+// from Attr(out_d/out_h/out_w) or Attr(scale); they are -1 when Input(Scale)
+// is present and are not set here for a runtime Input(OutSize).
+static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
+  auto dim_x = ctx->GetInputDim("X");
+  auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
+
+  PADDLE_ENFORCE_EQ(
+      "trilinear", interp_method,
+      platform::errors::InvalidArgument(
+          "Interpolation method can only be \"trilinear\" when Input(X) "
+          "dimension is 5, but got method = %s .",
+          interp_method));
+  const DataLayout data_layout = framework::StringToDataLayout(
+      ctx->Attrs().Get<std::string>("data_layout"));
+
+  if (ctx->HasInputs("SizeTensor")) {
+    // top priority size
+    auto inputs_name = ctx->Inputs("SizeTensor");
+    PADDLE_ENFORCE_EQ(
+        inputs_name.size(), 3,
+        platform::errors::InvalidArgument(
+            "Input(SizeTensor)'s size of Op(interpolate) must be 3. "
+            "Attr(out_shape)'s length must be 3 for 5-D input "
+            "tensor, but got size = %d .",
+            inputs_name.size()));
+    int out_d = ctx->Attrs().Get<int>("out_d");
+    int out_h = ctx->Attrs().Get<int>("out_h");
+    int out_w = ctx->Attrs().Get<int>("out_w");
+    framework::DDim dim_out;
+    if (data_layout == DataLayout::kNCHW) {
+      dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w};
+    } else {
+      dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]};
+    }
+    ctx->SetOutputDim("Out", dim_out);
+    return;
+  }
+
+  int out_d = -1, out_h = -1, out_w = -1;  // -1: unknown until runtime
+  if (ctx->HasInput("Scale")) {
+    auto scale_tensor = ctx->GetInputDim("Scale");
+    PADDLE_ENFORCE_EQ(
+        scale_tensor.size(), 1,
+        platform::errors::InvalidArgument(
+            "Scale's dimension size must be 1, but got size = %d .",
+            scale_tensor.size()));
+    PADDLE_ENFORCE_EQ(scale_tensor[0] == 3 || scale_tensor[0] == 1, true,
+                      platform::errors::InvalidArgument(
+                          "Scale's shape must be 3 or 1, but got shape = %d .",
+                          scale_tensor[0]));
+  } else {
+    auto scale = ctx->Attrs().Get<std::vector<float>>("scale");
+    if (scale.size() > 0) {
+      PADDLE_ENFORCE_EQ(scale.size(), 3, platform::errors::InvalidArgument(
+          "Attr(scale) must have 3 elements for 5-D input, but got %d .",
+          scale.size()));
+      const float scale_d = scale[0];
+      const float scale_h = scale[1];
+      const float scale_w = scale[2];
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+      // round down
+      out_d = (data_layout == DataLayout::kNCHW
+                   ? static_cast<int>(dim_x[2] * scale_d)
+                   : static_cast<int>(dim_x[1] * scale_d));
+      out_h = (data_layout == DataLayout::kNCHW
+                   ? static_cast<int>(dim_x[3] * scale_h)
+                   : static_cast<int>(dim_x[2] * scale_h));
+      out_w = (data_layout == DataLayout::kNCHW
+                   ? static_cast<int>(dim_x[4] * scale_w)
+                   : static_cast<int>(dim_x[3] * scale_w));
+      // protect when input shape is -1
+      out_d = out_d > 0 ? out_d : -1;
+      out_h = out_h > 0 ? out_h : -1;
+      out_w = out_w > 0 ? out_w : -1;
+    } else {
+      out_d = ctx->Attrs().Get<int>("out_d");
+      out_h = ctx->Attrs().Get<int>("out_h");
+      out_w = ctx->Attrs().Get<int>("out_w");
+    }
+  }
+
+  if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
+    auto out_size_dim = ctx->GetInputDim("OutSize");
+    PADDLE_ENFORCE_EQ(
+        out_size_dim.size(), 1,
+        platform::errors::InvalidArgument(
+            "OutSize's dimension size must be 1, but got size = %d .",
+            out_size_dim.size()));
+    PADDLE_ENFORCE_EQ(out_size_dim[0], 3,
+                      platform::errors::InvalidArgument(
+                          "OutSize's dim[0] must be 3, but got size = %d .",
+                          out_size_dim[0]));
+    ctx->ShareLoD("X", "Out");
+    return;
+  }
+
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w};
+  } else {
+    dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]};
+  }
+  ctx->SetOutputDim("Out", dim_out);
+}
+
+// Forward operator behind the *_interp_v2 ops; InferShape switches on rank.
+class InterpolateV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), platform::errors::InvalidArgument(
+        "Input(X) of InterpolateV2Op should not be null."));
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), platform::errors::InvalidArgument(
+        "Output(Out) of InterpolateV2Op should not be null."));
+    auto dim_x = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(
+        dim_x.size() == 3 || dim_x.size() == 4 || dim_x.size() == 5,
+        platform::errors::Unimplemented(
+            "Input(X) dimension must be 3, 4 or 5, but got dimension = %d .",
+            dim_x.size()));
+
+    if (dim_x.size() == 3) {
+      // 3-D input: shape check for 1D (linear) interpolate
+      Interpolate1DInferShapeCheck(ctx);
+    } else if (dim_x.size() == 4) {
+      // 4-D input: shape check for 2D interpolate
+      Interpolate2DInferShapeCheck(ctx);
+    } else {  // dim_x.size() == 5
+      // 5-D input: shape check for 3D (trilinear) interpolate
+      Interpolate3DInferShapeCheck(ctx);
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    // SizeTensor and Scale simply report the expected kernel type.
+    if (var_name == "SizeTensor" || var_name == "Scale") {
+      return expected_kernel_type;
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+// Declares the inputs, output, attributes and documentation of *_interp_v2.
+class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input tensor of interpolate operator. This is a 3-D tensor "
+             "with shape of [N, C, W], a 4-D tensor with shape of "
+             "[N, C, H, W] or a 5-D tensor with shape of [N, C, D, H, W].");
+    AddInput("OutSize",
+             "This is a 1-D tensor with two numbers to specify output size. "
+             "It should be [output_height, output_width] when input is a 4-D "
+             "tensor and should be [output_depth, output_height, output_width] "
+             "when input is a 5-D tensor. It has a higher priority than "
+             "the attr(out_d), attr(out_h), attr(out_w) and attr(scale).")
+        .AsDispensable();
+    AddInput("SizeTensor",
+             "(vector<Tensor<int32>>, optional). If provided, interpolate will "
+             "use this. The shape of the tensor in vector MUST BE [1]. "
+             "It has the highest priority compare with Input(OutSize) and "
+             "attr(out_d), attr(out_h), attr(out_w) and attr(scale).")
+        .AsDuplicable().AsDispensable();
+    AddInput("Scale",
+             "This is a 1-D tensor with one number to specify output scale. "
+             "It has the higher priority compare with attr(scale).")
+        .AsDispensable();
+    AddOutput("Out",
+              "The output tensor of interpolate operator, "
+              "This is a tensor in same rank with Input(X).");
+
+    AddAttr<std::string>(
+        "data_layout",
+        "(string, default NCHW) Only used in "
+        "an optional string from: \"NHWC\", \"NCHW\". "
+        "Specify that the data format of the input and output data is "
+        "channel_first or channel_last.")
+        .SetDefault("NCHW");
+    AddAttr<int>("out_d", "output depth of interpolate op.").SetDefault(0);
+    AddAttr<int>("out_h", "output height of interpolate op.").SetDefault(0);
+    AddAttr<int>("out_w", "output width of interpolate op.").SetDefault(0);
+    AddAttr<std::vector<float>>("scale", "scale_d factor of interpolate op.")
+        .SetDefault(std::vector<float>{});
+    AddAttr<std::string>("interp_method",
+                         "(string, default \"bilinear\"), interpolation "
+                         "method, can be \"linear\" for linear interpolation"
+                         ",\"bilinear\" for "
+                         "bilinear interpolation, \"trilinear\" for trilinear "
+                         "interpolation and \"nearest\" for nearest "
+                         "neighbor interpolation, and \"bicubic\" for bicubic "
+                         "interpolation.")
+        .SetDefault("bilinear");
+    AddAttr<bool>(
+        "align_corners",
+        "an optional bool. Defaults to True. "
+        "If True, the centers of 4 corner pixels of the input and output "
+        "tensors are aligned, preserving the values at the corner pixels, "
+        "If False, are not aligned")
+        .SetDefault(true);
+    AddAttr<int>("align_mode",
+                 "(int, default \'1\'), optional for bilinear interpolation, "
+                 "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , "
+                 "can be \'1\' for src_idx = scale*dst_index .")
+        .SetDefault(1);
+    AddComment(R"DOC(
+          This operator samples input X to given output shape by using specified
+          interpolation method, the interpolation methods can be \"nearest\"
+          for nearest neighbor interpolation and \"bilinear\" for bilinear
+          interpolation and \"linear\" for linear interpolation.
+
+          Nearest neighbor interpolation is to perform nearest neighbor interpolation
+          in both the 3rd dimension(in height direction) and the 4th dimension(in width
+          direction) on input tensor.
+
+          Linear interpolation is the method of using a line connecting two known quantities
+          to determine the value of an unknown quantity between the two known quantities.
+
+          Bilinear interpolation is an extension of linear interpolation for
+          interpolating functions of two variables (e.g. H-direction and
+          W-direction in this op) on a rectilinear 2D grid. The key idea is
+          to perform linear interpolation first in one direction, and then
+          again in the other direction.
+
+          Trilinear interpolation is an extension of linear interpolation for
+          interpolating functions of three variables (e.g. D-direction,
+          H-direction and W-direction in this op) on a rectilinear 3D grid.
+          The linear interpolation is performed on three directions.
+
+          Bicubic interpolation is an extension of cubic interpolation for interpolating
+          data points on a two-dimensional regular grid. The interpolated surface is
+          smoother than corresponding surfaces obtained by bilinear interpolation or
+          nearest-neighbor interpolation.
+
+          Align_corners and align_mode are optional parameters,the calculation method
+          of interpolation can be selected by them.
+
+          Example:
+
+          For scale:
+
+            if align_corners = True and out_{size}>1 :
+
+              scale_{factor} = (in_{size}-1.0)/(out_{size}-1.0)
+
+            else:
+
+              scale_{factor} = float(in_{size}/out_{size})
+
+
+          Nearest neighbor interpolation:
+
+          if:
+              align_corners = False
+
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+
+              H_out = \left \lfloor {H_{in} * scale_{factor}} \right \rfloor
+              W_out = \left \lfloor {W_{in} * scale_{factor}} \right \rfloor
+
+          else:
+              align_corners = True
+
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+
+              H_out = round(H_{in} * scale_{factor})
+              W_out = round(W_{in} * scale_{factor})
+
+          Bilinear interpolation:
+
+          if:
+              align_corners = False , align_mode = 0
+
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+
+
+          else:
+
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
+
+          Trilinear interpolation:
+
+          if:
+              align_corners = False , align_mode = 0
+
+              input : (N,C,D_in,H_in,W_in)
+              output: (N,C,D_out,H_out,W_out) where:
+
+              D_out = (D_{in}+0.5) * scale_{factor} - 0.5
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+
+
+          else:
+
+              input : (N,C,D_in,H_in,W_in)
+              output: (N,C,D_out,H_out,W_out) where:
+
+              D_out = D_{in} * scale_{factor}
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
+
+          Bicubic interpolation:
+
+          if:
+              align_corners = False
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+          else:
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
+
+          For details of nearest neighbor interpolation, please refer to Wikipedia:
+          https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
+
+          For details of bilinear interpolation, please refer to Wikipedia:
+          https://en.wikipedia.org/wiki/Bilinear_interpolation
+
+          For details of trilinear interpolation, please refer to Wikipedia:
+          https://en.wikipedia.org/wiki/Trilinear_interpolation
+
+          For details of bicubic interpolation, please refer to Wikipedia:
+          https://en.wikipedia.org/wiki/Bicubic_interpolation
+         )DOC");
+  }
+};
+
+// Backward operator behind the *_interp_v2_grad ops: X@GRAD takes X's shape.
+class InterpolateV2OpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), platform::errors::InvalidArgument(
+                                           "Input(X) should not be null"));
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   platform::errors::InvalidArgument(
+                       "Input(Out@GRAD) should not be null"));
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.GetPlace());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    return (var_name == "SizeTensor" || var_name == "Scale")
+               ? expected_kernel_type
+               : framework::OpKernelType(expected_kernel_type.data_type_,
+                                         tensor.place(), tensor.layout());
+  }
+};
+
+// Builds the *_grad op desc: forwards X, the optional size/scale inputs that
+// the forward op actually received, Out@GRAD and all forward attributes.
+template <typename T>
+class InterpolateV2GradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType(this->ForwardOpType() + "_grad");
+    op->SetInput("X", this->Input("X"));
+    // Optional inputs are wired only when the forward op has them.
+    for (const char* optional_in : {"SizeTensor", "OutSize", "Scale"}) {
+      if (this->HasInput(optional_in)) {
+        op->SetInput(optional_in, this->Input(optional_in));
+      }
+    }
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    // The grad op reuses every attribute of the forward op.
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(InterpolateV2GradNoNeedBufferVarsInferer,
+                                    "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(bilinear_interp_v2, ops::InterpolateV2Op,
+                  ops::InterpolateV2OpMaker,
+                  ops::InterpolateV2GradMaker<paddle::framework::OpDesc>,
+                  ops::InterpolateV2GradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(bilinear_interp_v2_grad, ops::InterpolateV2OpGrad,
+                  ops::InterpolateV2GradNoNeedBufferVarsInferer);
+REGISTER_OPERATOR(nearest_interp_v2, ops::InterpolateV2Op,
+                  ops::InterpolateV2OpMaker,
+                  ops::InterpolateV2GradMaker<paddle::framework::OpDesc>,
+                  ops::InterpolateV2GradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(nearest_interp_v2_grad, ops::InterpolateV2OpGrad,
+                  ops::InterpolateV2GradNoNeedBufferVarsInferer);
+REGISTER_OPERATOR(trilinear_interp_v2, ops::InterpolateV2Op,
+                  ops::InterpolateV2OpMaker,
+                  ops::InterpolateV2GradMaker<paddle::framework::OpDesc>,
+                  ops::InterpolateV2GradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(trilinear_interp_v2_grad, ops::InterpolateV2OpGrad,
+                  ops::InterpolateV2GradNoNeedBufferVarsInferer);
+REGISTER_OPERATOR(bicubic_interp_v2, ops::InterpolateV2Op,
+                  ops::InterpolateV2OpMaker,
+                  ops::InterpolateV2GradMaker<paddle::framework::OpDesc>,
+                  ops::InterpolateV2GradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(bicubic_interp_v2_grad, ops::InterpolateV2OpGrad,
+                  ops::InterpolateV2GradNoNeedBufferVarsInferer);
+REGISTER_OP_CPU_KERNEL(bilinear_interp_v2, ops::InterpolateV2Kernel<float>,
+                       ops::InterpolateV2Kernel<double>,
+                       ops::InterpolateV2Kernel<uint8_t>);
+REGISTER_OP_CPU_KERNEL(bilinear_interp_v2_grad,
+                       ops::InterpolateV2GradKernel<float>,
+                       ops::InterpolateV2GradKernel<double>);
+REGISTER_OP_CPU_KERNEL(nearest_interp_v2, ops::InterpolateV2Kernel<float>,
+                       ops::InterpolateV2Kernel<double>,
+                       ops::InterpolateV2Kernel<uint8_t>);
+REGISTER_OP_CPU_KERNEL(nearest_interp_v2_grad,
+                       ops::InterpolateV2GradKernel<float>,
+                       ops::InterpolateV2GradKernel<double>);
+REGISTER_OP_CPU_KERNEL(trilinear_interp_v2, ops::InterpolateV2Kernel<float>,
+                       ops::InterpolateV2Kernel<double>,
+                       ops::InterpolateV2Kernel<uint8_t>);
+REGISTER_OP_CPU_KERNEL(trilinear_interp_v2_grad,
+                       ops::InterpolateV2GradKernel<float>,
+                       ops::InterpolateV2GradKernel<double>);
+REGISTER_OPERATOR(linear_interp_v2, ops::InterpolateV2Op,
+                  ops::InterpolateV2OpMaker,
+                  ops::InterpolateV2GradMaker<paddle::framework::OpDesc>,
+                  ops::InterpolateV2GradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(linear_interp_v2_grad, ops::InterpolateV2OpGrad,
+                  ops::InterpolateV2GradNoNeedBufferVarsInferer);
+REGISTER_OP_CPU_KERNEL(linear_interp_v2, ops::InterpolateV2Kernel<float>,
+                       ops::InterpolateV2Kernel<double>,
+                       ops::InterpolateV2Kernel<uint8_t>);
+REGISTER_OP_CPU_KERNEL(linear_interp_v2_grad,
+                       ops::InterpolateV2GradKernel<float>,
+                       ops::InterpolateV2GradKernel<double>);
+REGISTER_OP_CPU_KERNEL(bicubic_interp_v2, ops::InterpolateV2Kernel<float>,
+                       ops::InterpolateV2Kernel<double>);
+REGISTER_OP_CPU_KERNEL(bicubic_interp_v2_grad,
+                       ops::InterpolateV2GradKernel<float>,
+                       ops::InterpolateV2GradKernel<double>);
diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6cb8104638dea458743374014e7bef35df2dbfcc
--- /dev/null
+++ b/paddle/fluid/operators/interpolate_v2_op.cu
@@ -0,0 +1,1578 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <algorithm>
+#include <string>
+#include "paddle/fluid/operators/interpolate_v2_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using DataLayout = framework::DataLayout;
+
+// Nearest-neighbour resize, forward pass.
+//
+// `out` is written as a flat array of output_h * output_w elements using a
+// grid-stride loop. Element `idx` lies in row idx / output_w; the same row
+// index selects a row of input_w elements in `in`. Inside a row, positions
+// are decoded channel-major when data_layout is kNCHW and channel-minor for
+// any other layout value.
+//
+// The source coordinate along each axis is ratio * dst + 0.5 truncated to int
+// when align_corners is set, and ratio * dst truncated to int otherwise. Each
+// output element is a plain copy of the selected input element.
+//
+// in_img_h, input_h and out_img_h are accepted but not read here.
+template <typename T>
+__global__ void KeNearestNeighborInterpFw(
+    const T* in, const size_t in_img_h, const size_t in_img_w,
+    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners, const DataLayout data_layout) {
+  const int total = output_h * output_w;
+  const int step = blockDim.x * gridDim.x;
+  const bool is_nchw = (data_layout == DataLayout::kNCHW);
+
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total;
+       idx += step) {
+    const int row = idx / output_w;
+    const int col = idx % output_w;
+    const int in_plane = input_w / num_channels;
+    const int out_plane = output_w / num_channels;
+
+    // Decode (channel, y, x) of the destination element.
+    int c, oy, ox;
+    if (is_nchw) {
+      c = col / out_plane;
+      oy = (col % out_plane) / out_img_w;
+      ox = idx % out_img_w;
+    } else {
+      oy = col / (out_img_w * num_channels);
+      ox = col % (out_img_w * num_channels) / num_channels;
+      c = idx % num_channels;
+    }
+
+    // Map the destination coordinate onto the source grid.
+    int iy, ix;
+    if (align_corners) {
+      iy = static_cast<int>(ratio_h * oy + 0.5);
+      ix = static_cast<int>(ratio_w * ox + 0.5);
+    } else {
+      iy = static_cast<int>(ratio_h * oy);
+      ix = static_cast<int>(ratio_w * ox);
+    }
+
+    const size_t src =
+        is_nchw ? row * input_w + c * in_plane + iy * in_img_w + ix
+                : row * input_w + iy * in_img_w * num_channels +
+                      ix * num_channels + c;
+    out[idx] = in[src];
+  }
+}
+
+// Nearest-neighbour resize, backward pass.
+//
+// `out` is read as a flat array of output_h * output_w values (grid-stride
+// loop) and every value is atomically added to one element of `in`. The
+// target element is found by decoding the position the same way for both
+// buffers: row = idx / output_w, then channel-major (kNCHW) or channel-minor
+// (any other layout value) inside the row. The source coordinate is
+// ratio * dst + 0.5 truncated to int when align_corners is set, otherwise
+// ratio * dst truncated to int.
+//
+// NOTE(review): the kernel only accumulates; presumably the caller clears
+// `in` beforehand - confirm at the launch site.
+//
+// in_img_h, input_h and out_img_h are accepted but not read here.
+template <typename T>
+__global__ void KeNearestNeighborInterpBw(
+    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
+    const size_t input_w, const T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners, const DataLayout data_layout) {
+  const int total = output_h * output_w;
+  const int step = blockDim.x * gridDim.x;
+  const bool is_nchw = (data_layout == DataLayout::kNCHW);
+
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total;
+       idx += step) {
+    const int row = idx / output_w;
+    const int col = idx % output_w;
+    const int in_plane = input_w / num_channels;
+    const int out_plane = output_w / num_channels;
+
+    // Decode (channel, y, x) of the gradient element being scattered.
+    int c, oy, ox;
+    if (is_nchw) {
+      c = col / out_plane;
+      oy = (col % out_plane) / out_img_w;
+      ox = idx % out_img_w;
+    } else {
+      oy = col / (out_img_w * num_channels);
+      ox = col % (out_img_w * num_channels) / num_channels;
+      c = idx % num_channels;
+    }
+
+    int iy, ix;
+    if (align_corners) {
+      iy = static_cast<int>(ratio_h * oy + 0.5);
+      ix = static_cast<int>(ratio_w * ox + 0.5);
+    } else {
+      iy = static_cast<int>(ratio_h * oy);
+      ix = static_cast<int>(ratio_w * ox);
+    }
+
+    const size_t dst =
+        is_nchw ? row * input_w + c * in_plane + iy * in_img_w + ix
+                : row * input_w + iy * in_img_w * num_channels +
+                      ix * num_channels + c;
+    const T grad = out[row * output_w + col];
+    // Several destination positions may map to the same source element.
+    platform::CudaAtomicAdd(&in[dst], grad);
+  }
+}
+
+// 1-D linear resize, forward pass.
+//
+// `out` is written as a flat array of output_h * output_w elements using a
+// grid-stride loop. Element `tid` lies in row tid / output_w, and the same
+// row index selects a row of input_w elements in `in`. Inside a row,
+// positions are decoded channel-major when data_layout is kNCHW and
+// channel-minor for any other layout value.
+//
+// Coordinate mapping: when align_mode == 0 and align_corners is false, the
+// source coordinate is ratio_w * (dst + 0.5) - 0.5 (clamped at 0); otherwise
+// it is ratio_w * dst. The result blends the element at the truncated source
+// index with its right-hand neighbour; at the last column the neighbour
+// offset is 0, so the element is blended with itself.
+template <typename T>
+__global__ void KeLinearInterpFw(const T* in, const size_t in_img_w,
+                                 const size_t input_w, T* out,
+                                 const size_t out_img_w, const size_t output_h,
+                                 const size_t output_w,
+                                 const size_t num_channels, const float ratio_w,
+                                 const bool align_corners, const int align_mode,
+                                 const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    int channel_id, out_img_idx;
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    int in_img_idx = align_flag
+                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;  // w
+    // 1 while a right-hand neighbour exists, 0 at the last column.
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;  // w_id
+
+    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
+    src_w = (src_w > 0) ? src_w : 0;
+    // Weight of the right-hand neighbour; the left element gets the rest.
+    T w1lambda =
+        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+
+    if (data_layout == DataLayout::kNCHW) {
+      // The row offset is out_id_h * input_w (one row holds input_w
+      // elements), matching the channel-minor branch below.
+      const T* in_pos =
+          &in[out_id_h * input_w + channel_id * in_img_size + in_img_idx];
+      // linear interpolation
+      out[out_id_h * output_w + out_id_w] =
+          w2lambda * in_pos[0] + w1lambda * in_pos[w_id];
+
+    } else {
+      const T* in_pos =
+          &in[out_id_h * input_w + in_img_idx * num_channels + channel_id];
+      // linear interpolation
+      out[out_id_h * output_w + out_id_w] =
+          w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels];
+    }
+  }
+}
+
+// 1-D linear resize, backward pass.
+//
+// `out` is read as a flat array of output_h * output_w values using a
+// grid-stride loop; each value is split between two elements of `in` with
+// atomic adds. Element `tid` lies in row tid / output_w and the same row
+// index selects a row of input_w elements in `in`. Inside a row, positions
+// are decoded channel-major when data_layout is kNCHW and channel-minor for
+// any other layout value.
+//
+// Coordinate mapping: when align_mode == 0 and align_corners is false, the
+// source coordinate is ratio_w * (dst + 0.5) - 0.5 (clamped at 0); otherwise
+// it is ratio_w * dst. At the last column the neighbour offset is 0, so both
+// contributions land on the same element.
+//
+// NOTE(review): the kernel only accumulates; presumably the caller clears
+// `in` beforehand - confirm at the launch site.
+template <typename T>
+__global__ void KeLinearInterpBw(T* in, const size_t in_img_w,
+                                 const size_t input_w, const T* out,
+                                 const size_t out_img_w, const size_t output_h,
+                                 const size_t output_w,
+                                 const size_t num_channels, const T ratio_w,
+                                 const bool align_corners, const int align_mode,
+                                 const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    int channel_id, out_img_idx;
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5
+                                : ratio_w * out_img_idx;
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;  // w
+    // 1 while a right-hand neighbour exists, 0 at the last column.
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;  // w_id
+
+    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
+    src_w = (src_w > 0) ? src_w : 0;
+    // Weight of the right-hand neighbour; the left element gets the rest.
+    T w1lambda =
+        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+
+    // Left element and distance (in elements) to its right-hand neighbour.
+    T* in_pos;
+    size_t neighbour_offset;
+    if (data_layout == DataLayout::kNCHW) {
+      in_pos = &in[out_id_h * input_w + channel_id * in_img_size + in_img_idx];
+      neighbour_offset = w_id;
+    } else {
+      in_pos = &in[out_id_h * input_w + in_img_idx * num_channels + channel_id];
+      neighbour_offset = w_id * num_channels;
+    }
+    // The gradient must be taken from this element's own row; indexing with
+    // out_id_w alone would make every row read the gradient of row 0.
+    const T* out_pos = &out[out_id_h * output_w + out_id_w];
+
+    platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
+    platform::CudaAtomicAdd(&in_pos[neighbour_offset], w1lambda * out_pos[0]);
+  }
+}
+
+// 2-D bilinear resize, forward pass.
+//
+// `out` is written as a flat array of output_h * output_w elements using a
+// grid-stride loop. Element `idx` lies in row idx / output_w, and the same
+// row index selects a row of input_w elements in `in`. Inside a row,
+// positions are decoded channel-major when data_layout is kNCHW and
+// channel-minor for any other layout value.
+//
+// Coordinate mapping per axis: when align_mode == 0 and align_corners is
+// false the source coordinate is ratio * (dst + 0.5) - 0.5 (clamped at 0),
+// otherwise ratio * dst. The four surrounding source elements are blended;
+// at the last row/column the neighbour step is 0, so the edge element is
+// reused.
+//
+// input_h and out_img_h are accepted but not read here.
+template <typename T>
+__global__ void KeBilinearInterpFw(
+    const T* in, const size_t in_img_h, const size_t in_img_w,
+    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners, const int align_mode,
+    const DataLayout data_layout) {
+  const int total = output_h * output_w;
+  const int step = blockDim.x * gridDim.x;
+  const bool shift_half = (align_mode == 0 && !align_corners);
+  const bool is_nchw = (data_layout == DataLayout::kNCHW);
+
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total;
+       idx += step) {
+    const int row = idx / output_w;
+    const int col = idx % output_w;
+    const int in_plane = input_w / num_channels;
+    const int out_plane = output_w / num_channels;
+
+    // Decode (channel, y, x) of the destination element.
+    int c, oy, ox;
+    if (is_nchw) {
+      c = col / out_plane;
+      oy = (col % out_plane) / out_img_w;
+      ox = idx % out_img_w;
+    } else {
+      oy = col / (out_img_w * num_channels);
+      ox = col % (out_img_w * num_channels) / num_channels;
+      c = idx % num_channels;
+    }
+
+    // Vertical axis: upper source row, step to the lower row, blend weights.
+    int iy = shift_half ? static_cast<int>(ratio_h * (oy + 0.5) - 0.5)
+                        : static_cast<int>(ratio_h * oy);
+    if (iy < 0) iy = 0;
+    const int dy = (iy < in_img_h - 1) ? 1 : 0;
+    T sy = ratio_h * (oy + 0.5) - 0.5;
+    if (!(sy > 0)) sy = 0;
+    const T wy_hi = shift_half ? sy - iy : ratio_h * oy - iy;
+    const T wy_lo = 1.f - wy_hi;
+
+    // Horizontal axis: left source column, step to the right, blend weights.
+    int ix = shift_half ? static_cast<int>(ratio_w * (ox + 0.5) - 0.5)
+                        : static_cast<int>(ratio_w * ox);
+    if (ix < 0) ix = 0;
+    const int dx = (ix < in_img_w - 1) ? 1 : 0;
+    T sx = ratio_w * (ox + 0.5) - 0.5;
+    if (!(sx > 0)) sx = 0;
+    const T wx_hi = shift_half ? sx - ix : ratio_w * ox - ix;
+    const T wx_lo = 1.f - wx_hi;
+
+    // Address of the upper-left sample and element strides to its
+    // neighbours in the chosen layout.
+    size_t base, x_step, y_step;
+    if (is_nchw) {
+      base = row * input_w + c * in_plane + iy * in_img_w + ix;
+      x_step = dx;
+      y_step = dy * in_img_w;
+    } else {
+      base = row * input_w + iy * in_img_w * num_channels +
+             ix * num_channels + c;
+      x_step = dx * num_channels;
+      y_step = dy * in_img_w * num_channels;
+    }
+    const T* corner = &in[base];
+    const T upper_left = corner[0];
+    const T upper_right = corner[x_step];
+    const T lower_left = corner[y_step];
+    const T lower_right = corner[y_step + x_step];
+
+    // bilinear interpolation
+    out[row * output_w + col] =
+        wy_lo * (wx_lo * upper_left + wx_hi * upper_right) +
+        wy_hi * (wx_lo * lower_left + wx_hi * lower_right);
+  }
+}
+
+// 2-D bilinear resize, backward pass.
+//
+// `out` is read as a flat array of output_h * output_w values using a
+// grid-stride loop; each value is distributed over four elements of `in`
+// with atomic adds, weighted by the bilinear coefficients. Element `idx`
+// lies in row idx / output_w and the same row index selects a row of input_w
+// elements in `in`. Inside a row, positions are decoded channel-major when
+// data_layout is kNCHW and channel-minor for any other layout value.
+//
+// Coordinate mapping per axis: when align_mode == 0 and align_corners is
+// false the source coordinate is ratio * (dst + 0.5) - 0.5 (clamped at 0),
+// otherwise ratio * dst. At the last row/column the neighbour step is 0, so
+// the corresponding contributions land on the same element.
+//
+// NOTE(review): the kernel only accumulates; presumably the caller clears
+// `in` beforehand - confirm at the launch site.
+//
+// input_h and out_img_h are accepted but not read here.
+template <typename T>
+__global__ void KeBilinearInterpBw(
+    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
+    const size_t input_w, const T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const T ratio_h, const T ratio_w,
+    const bool align_corners, const int align_mode,
+    const DataLayout data_layout) {
+  const int total = output_h * output_w;
+  const int step = blockDim.x * gridDim.x;
+  const bool shift_half = (align_mode == 0 && !align_corners);
+  const bool is_nchw = (data_layout == DataLayout::kNCHW);
+
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total;
+       idx += step) {
+    const int row = idx / output_w;
+    const int col = idx % output_w;
+    const int in_plane = input_w / num_channels;
+    const int out_plane = output_w / num_channels;
+
+    // Decode (channel, y, x) of the gradient element being scattered.
+    int c, oy, ox;
+    if (is_nchw) {
+      c = col / out_plane;
+      oy = (col % out_plane) / out_img_w;
+      ox = idx % out_img_w;
+    } else {
+      oy = col / (out_img_w * num_channels);
+      ox = col % (out_img_w * num_channels) / num_channels;
+      c = idx % num_channels;
+    }
+
+    // Vertical axis: upper source row, step to the lower row, blend weights.
+    int iy = shift_half ? ratio_h * (oy + 0.5) - 0.5 : ratio_h * oy;
+    if (iy < 0) iy = 0;
+    const int dy = (iy < in_img_h - 1) ? 1 : 0;
+    T sy = ratio_h * (oy + 0.5) - 0.5;
+    if (!(sy > 0)) sy = 0;
+    const T wy_hi = shift_half ? sy - iy : ratio_h * oy - iy;
+    const T wy_lo = 1.f - wy_hi;
+
+    // Horizontal axis: left source column, step to the right, blend weights.
+    int ix = shift_half ? ratio_w * (ox + 0.5) - 0.5 : ratio_w * ox;
+    if (ix < 0) ix = 0;
+    const int dx = (ix < in_img_w - 1) ? 1 : 0;
+    T sx = ratio_w * (ox + 0.5) - 0.5;
+    if (!(sx > 0)) sx = 0;
+    const T wx_hi = shift_half ? sx - ix : ratio_w * ox - ix;
+    const T wx_lo = 1.f - wx_hi;
+
+    // Address of the upper-left target and element strides to its
+    // neighbours in the chosen layout.
+    size_t base, x_step, y_step;
+    if (is_nchw) {
+      base = row * input_w + c * in_plane + iy * in_img_w + ix;
+      x_step = dx;
+      y_step = dy * in_img_w;
+    } else {
+      base = row * input_w + iy * in_img_w * num_channels +
+             ix * num_channels + c;
+      x_step = dx * num_channels;
+      y_step = dy * in_img_w * num_channels;
+    }
+    T* corner = &in[base];
+    const T* grad = &out[row * output_w + col];
+
+    // Scatter in the order: upper-left, upper-right, lower-left, lower-right.
+    platform::CudaAtomicAdd(&corner[0], wy_lo * wx_lo * grad[0]);
+    platform::CudaAtomicAdd(&corner[x_step], wy_lo * wx_hi * grad[0]);
+    platform::CudaAtomicAdd(&corner[y_step], wy_hi * wx_lo * grad[0]);
+    platform::CudaAtomicAdd(&corner[y_step + x_step],
+                            wy_hi * wx_hi * grad[0]);
+  }
+}
+
+// 3-D trilinear resize, forward pass.
+//
+// `out` is written as a flat array of output_h * output_w elements using a
+// grid-stride loop. Element `tid` lies in row tid / output_w, and the same
+// row index selects a row of input_w elements in `in`. Inside a row,
+// positions are decoded channel-major when data_layout is kNCHW and
+// channel-minor for any other layout value; the three spatial indices are
+// named t (slowest-varying, paired with in_img_d / ratio_d), y and x.
+//
+// Coordinate mapping per axis: when align_mode == 0 and align_corners is
+// false the source coordinate is ratio * (dst + 0.5) - 0.5 (clamped at 0),
+// otherwise ratio * dst. Eight surrounding source elements are blended; on
+// the last slice/row/column the neighbour step is 0, so the edge element is
+// reused.
+//
+// input_h and out_img_d are accepted but not read here.
+template <typename T>
+__global__ void KeTrilinearInterpFw(
+    const T* in, const size_t in_img_d, const size_t in_img_h,
+    const size_t in_img_w, const size_t input_h, const size_t input_w, T* out,
+    const size_t out_img_d, const size_t out_img_h, const size_t out_img_w,
+    const size_t output_h, const size_t output_w, const size_t num_channels,
+    const float ratio_d, const float ratio_h, const float ratio_w,
+    const bool align_corners, const int align_mode,
+    const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    // Decode (channel, t, y, x) of the destination element.
+    int channel_id, out_img_idt, out_img_idy, out_img_idx;
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w;
+      out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels);
+      out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) /
+                    (out_img_w * num_channels);
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    // t axis: lower source index, step to the next slice (0 on the last
+    // one), and blend weights (d1lambda for the next slice, d2lambda for the
+    // current one).
+    int in_img_idt = align_flag
+                         ? static_cast<int>(ratio_d * (out_img_idt + 0.5) - 0.5)
+                         : static_cast<int>(ratio_d * out_img_idt);
+    in_img_idt = (in_img_idt > 0) ? in_img_idt : 0;
+    int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0;
+    T src_d = ratio_d * (out_img_idt + 0.5) - 0.5;
+    src_d = (src_d > 0) ? src_d : 0;
+    T d1lambda =
+        align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt;
+    T d2lambda = 1.f - d1lambda;
+
+    // y axis: same scheme as for t.
+    int in_img_idy = align_flag
+                         ? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * out_img_idy);
+    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
+    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
+    T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
+    src_h = (src_h > 0) ? src_h : 0;
+    T h1lambda =
+        align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
+    T h2lambda = 1.f - h1lambda;
+
+    // x axis: same scheme as for t.
+    int in_img_idx = align_flag
+                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
+    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
+    src_w = (src_w > 0) ? src_w : 0;
+    T w1lambda =
+        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+
+    if (data_layout == DataLayout::kNCHW) {
+      // in_pos1 addresses the sample in the current slice, in_pos2 the same
+      // (y, x) position in the next slice (or the same slice when d_id is 0).
+      int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
+                        (in_img_idt * in_img_h + in_img_idy) * in_img_w +
+                        in_img_idx;
+      const T* in_pos1 = &in[in_pos1_idx];
+      int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w;
+      const T* in_pos2 = &in[in_pos2_idx];
+
+      // trilinear interpolation
+      out[out_id_h * output_w + out_id_w] =
+          d2lambda *
+              (h2lambda * (w2lambda * in_pos1[0] + w1lambda * in_pos1[w_id]) +
+               h1lambda * (w2lambda * in_pos1[h_id * in_img_w] +
+                           w1lambda * in_pos1[h_id * in_img_w + w_id])) +
+          d1lambda *
+              (h2lambda * (w2lambda * in_pos2[0] + w1lambda * in_pos2[w_id]) +
+               h1lambda * (w2lambda * in_pos2[h_id * in_img_w] +
+                           w1lambda * in_pos2[h_id * in_img_w + w_id]));
+
+    } else {
+      // Channel-minor layout: every spatial step is scaled by num_channels.
+      int in_pos1_idx = out_id_h * input_w +
+                        in_img_idt * in_img_h * in_img_w * num_channels +
+                        in_img_idy * in_img_w * num_channels +
+                        in_img_idx * num_channels + channel_id;
+      const T* in_pos1 = &in[in_pos1_idx];
+      int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels;
+      const T* in_pos2 = &in[in_pos2_idx];
+
+      // trilinear interpolation
+      out[out_id_h * output_w + out_id_w] =
+          d2lambda *
+              (h2lambda * (w2lambda * in_pos1[0] +
+                           w1lambda * in_pos1[w_id * num_channels]) +
+               h1lambda * (w2lambda * in_pos1[h_id * in_img_w * num_channels] +
+                           w1lambda * in_pos1[h_id * in_img_w * num_channels +
+                                              w_id * num_channels])) +
+          d1lambda *
+              (h2lambda * (w2lambda * in_pos2[0] +
+                           w1lambda * in_pos2[w_id * num_channels]) +
+               h1lambda * (w2lambda * in_pos2[h_id * in_img_w * num_channels] +
+                           w1lambda * in_pos2[h_id * in_img_w * num_channels +
+                                              w_id * num_channels]));
+    }
+  }
+}
+
+// 3-D trilinear resize, backward pass.
+//
+// `out` is read as a flat array of output_h * output_w values using a
+// grid-stride loop; each value is distributed over eight elements of `in`
+// with atomic adds, weighted by the trilinear coefficients. Element `tid`
+// lies in row tid / output_w and the same row index selects a row of input_w
+// elements in `in`. Inside a row, positions are decoded channel-major when
+// data_layout is kNCHW and channel-minor for any other layout value; the
+// three spatial indices are named t (slowest-varying, paired with in_img_d /
+// ratio_d), y and x.
+//
+// Coordinate mapping per axis: when align_mode == 0 and align_corners is
+// false the source coordinate is ratio * (dst + 0.5) - 0.5 (clamped at 0),
+// otherwise ratio * dst. On the last slice/row/column the neighbour step is
+// 0, so the corresponding contributions land on the same element.
+//
+// NOTE(review): the kernel only accumulates; presumably the caller clears
+// `in` beforehand - confirm at the launch site.
+//
+// input_h and out_img_d are accepted but not read here.
+template <typename T>
+__global__ void KeTrilinearInterpBw(
+    T* in, const size_t in_img_d, const size_t in_img_h, const size_t in_img_w,
+    const size_t input_h, const size_t input_w, const T* out,
+    const size_t out_img_d, const size_t out_img_h, const size_t out_img_w,
+    const size_t output_h, const size_t output_w, const size_t num_channels,
+    const T ratio_d, const T ratio_h, const T ratio_w, const bool align_corners,
+    const int align_mode, const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    // Decode (channel, t, y, x) of the gradient element being scattered.
+    int channel_id, out_img_idt, out_img_idy, out_img_idx;
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w;
+      out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels);
+      out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) /
+                    (out_img_w * num_channels);
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    // t axis: lower source index, step to the next slice (0 on the last
+    // one), and blend weights (d1lambda for the next slice, d2lambda for the
+    // current one).
+    int in_img_idt = align_flag
+                         ? static_cast<int>(ratio_d * (out_img_idt + 0.5) - 0.5)
+                         : static_cast<int>(ratio_d * out_img_idt);
+    in_img_idt = (in_img_idt > 0) ? in_img_idt : 0;
+    int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0;
+    T src_d = ratio_d * (out_img_idt + 0.5) - 0.5;
+    src_d = (src_d > 0) ? src_d : 0;
+    T d1lambda =
+        align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt;
+    T d2lambda = 1.f - d1lambda;
+
+    // y axis: same scheme as for t.
+    int in_img_idy = align_flag
+                         ? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * out_img_idy);
+    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
+    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
+    T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
+    src_h = (src_h > 0) ? src_h : 0;
+    T h1lambda =
+        align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
+    T h2lambda = 1.f - h1lambda;
+
+    // x axis: same scheme as for t.
+    int in_img_idx = align_flag
+                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
+    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
+    src_w = (src_w > 0) ? src_w : 0;
+    T w1lambda =
+        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+
+    if (data_layout == DataLayout::kNCHW) {
+      // in_pos1 addresses the target in the current slice, in_pos2 the same
+      // (y, x) position in the next slice (or the same slice when d_id is 0).
+      int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
+                        (in_img_idt * in_img_h + in_img_idy) * in_img_w +
+                        in_img_idx;
+      T* in_pos1 = &in[in_pos1_idx];
+      int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w;
+      T* in_pos2 = &in[in_pos2_idx];
+
+      const T* out_pos = &out[out_id_h * output_w + out_id_w];
+
+      // trilinear interpolation grad
+      platform::CudaAtomicAdd(&in_pos1[0],
+                              d2lambda * h2lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos1[w_id],
+                              d2lambda * h2lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
+                              d2lambda * h1lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
+                              d2lambda * h1lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[0],
+                              d1lambda * h2lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[w_id],
+                              d1lambda * h2lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
+                              d1lambda * h1lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
+                              d1lambda * h1lambda * w1lambda * out_pos[0]);
+    } else {
+      // Channel-minor layout: every spatial step is scaled by num_channels.
+      int in_pos1_idx = out_id_h * input_w +
+                        in_img_idt * in_img_h * in_img_w * num_channels +
+                        in_img_idy * in_img_w * num_channels +
+                        in_img_idx * num_channels + channel_id;
+      T* in_pos1 = &in[in_pos1_idx];
+      int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels;
+      T* in_pos2 = &in[in_pos2_idx];
+
+      const T* out_pos = &out[out_id_h * output_w + out_id_w];
+
+      // trilinear interpolation grad
+      platform::CudaAtomicAdd(&in_pos1[0],
+                              d2lambda * h2lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos1[w_id * num_channels],
+                              d2lambda * h2lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels],
+                              d2lambda * h1lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(
+          &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels],
+          d2lambda * h1lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[0],
+                              d1lambda * h2lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[w_id * num_channels],
+                              d1lambda * h2lambda * w1lambda * out_pos[0]);
+      platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels],
+                              d1lambda * h1lambda * w2lambda * out_pos[0]);
+      platform::CudaAtomicAdd(
+          &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels],
+          d1lambda * h1lambda * w1lambda * out_pos[0]);
+    }
+  }
+}
+
+// Weighted sum of four consecutive samples x0..x3 for a fractional position
+// t measured from x1. The two inner weights come from cubic_convolution1 and
+// the two outer ones from cubic_convolution2 (both defined outside this
+// block), evaluated with the constant -0.75 as their second argument.
+template <typename T>
+__device__ __forceinline__ static T Kecubic_interp(const T x0, const T x1,
+                                                   const T x2, const T x3,
+                                                   T t) {
+  T alpha = -0.75;
+  // Distance from x1 and from x2 respectively.
+  T from_x1 = t;
+  T from_x2 = 1.0 - t;
+
+  const T weight0 = cubic_convolution2<T>(from_x1 + 1.0, alpha);
+  const T weight1 = cubic_convolution1<T>(from_x1, alpha);
+  const T weight2 = cubic_convolution1<T>(from_x2, alpha);
+  const T weight3 = cubic_convolution2<T>(from_x2 + 1.0, alpha);
+
+  return x0 * weight0 + x1 * weight1 + x2 * weight2 + x3 * weight3;
+}
+
+// 2-D bicubic resize, forward pass.
+//
+// `out` is written as a flat array of output_h * output_w elements using a
+// grid-stride loop. Element `tid` lies in row tid / output_w, and the same
+// row index selects a row of input_w elements in `in`. Inside a row,
+// positions are decoded channel-major when data_layout is kNCHW and
+// channel-minor for any other layout value.
+//
+// Coordinate mapping per axis: ratio * dst when align_corners is set,
+// otherwise ratio * (dst + 0.5) - 0.5. A 4x4 neighbourhood around the floored
+// source coordinate is sampled with indices clamped to the image bounds;
+// each of the four rows is reduced with Kecubic_interp along x, and the four
+// row results are reduced with Kecubic_interp along y.
+//
+// input_h and out_img_h are accepted but not read here.
+template <typename T>
+__global__ void KeBicubicInterpFw(
+    const T* in, const size_t in_img_h, const size_t in_img_w,
+    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners, const DataLayout data_layout) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+
+    int channel_id, out_img_idy, out_img_idx;
+
+    if (data_layout == DataLayout::kNCHW) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idy = (out_id_w % out_img_size) / out_img_w;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idy = out_id_w / (out_img_w * num_channels);
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    // Fractional source row, its integer part and the remainder y_t.
+    // NOTE(review): floorf takes a float, so when T is double the coordinate
+    // is narrowed before flooring - confirm this is intended.
+    T in_img_idy = align_corners
+                       ? static_cast<T>(ratio_h * out_img_idy)
+                       : static_cast<T>(ratio_h * (out_img_idy + 0.5) - 0.5);
+    int input_y = floorf(in_img_idy);
+    const T y_t = in_img_idy - input_y;
+
+    // Fractional source column, its integer part and the remainder x_t.
+    T in_img_idx = align_corners
+                       ? static_cast<T>(ratio_w * out_img_idx)
+                       : static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
+    int input_x = floorf(in_img_idx);
+    const T x_t = in_img_idx - input_x;
+
+    // coefficients[k] holds the x-direction result for neighbourhood row k.
+    T coefficients[4];
+    const T* in_pos_0;
+    const T* in_pos_1;
+    const T* in_pos_2;
+    const T* in_pos_3;
+    int access_x_0;
+    if (data_layout == DataLayout::kNCHW) {
+      for (int k = 0; k < 4; k++) {
+        // Rows input_y - 1 .. input_y + 2 and columns input_x - 1 ..
+        // input_x + 2, each clamped to [0, size - 1].
+        int access_y =
+            max(min(input_y - 1 + k, static_cast<int>(in_img_h - 1)), 0);
+        access_x_0 = max(min(input_x - 1, static_cast<int>(in_img_w - 1)), 0);
+        int access_x_1 =
+            max(min(input_x + 0, static_cast<int>(in_img_w - 1)), 0);
+        int access_x_2 =
+            max(min(input_x + 1, static_cast<int>(in_img_w - 1)), 0);
+        int access_x_3 =
+            max(min(input_x + 2, static_cast<int>(in_img_w - 1)), 0);
+
+        in_pos_0 = &in[out_id_h * input_w + channel_id * in_img_size +
+                       access_y * in_img_w + access_x_0];
+        in_pos_1 = &in[out_id_h * input_w + channel_id * in_img_size +
+                       access_y * in_img_w + access_x_1];
+        in_pos_2 = &in[out_id_h * input_w + channel_id * in_img_size +
+                       access_y * in_img_w + access_x_2];
+        in_pos_3 = &in[out_id_h * input_w + channel_id * in_img_size +
+                       access_y * in_img_w + access_x_3];
+
+        coefficients[k] = Kecubic_interp<T>(in_pos_0[0], in_pos_1[0],
+                                            in_pos_2[0], in_pos_3[0], x_t);
+      }
+
+      // Combine the four row results along y.
+      out[out_id_h * output_w + out_id_w] =
+          Kecubic_interp<T>(coefficients[0], coefficients[1], coefficients[2],
+                            coefficients[3], y_t);
+
+    } else {
+      for (int k = 0; k < 4; k++) {
+        // Same clamped neighbourhood as above. The locals declared in this
+        // loop shadow access_x_0 and in_pos_0..3 from the enclosing scope.
+        int access_y =
+            max(min(input_y - 1 + k, static_cast<int>((in_img_h - 1))), 0);
+        int access_x_0 =
+            max(min(input_x - 1, static_cast<int>((in_img_w - 1))), 0);
+        int access_x_1 =
+            max(min(input_x + 0, static_cast<int>((in_img_w - 1))), 0);
+        int access_x_2 =
+            max(min(input_x + 1, static_cast<int>((in_img_w - 1))), 0);
+        int access_x_3 =
+            max(min(input_x + 2, static_cast<int>((in_img_w - 1))), 0);
+
+        // Channel-minor layout: spatial steps are scaled by num_channels.
+        const T* in_pos_0 =
+            &in[out_id_h * input_w + access_y * in_img_w * num_channels +
+                access_x_0 * num_channels + channel_id];
+        const T* in_pos_1 =
+            &in[out_id_h * input_w + access_y * in_img_w * num_channels +
+                access_x_1 * num_channels + channel_id];
+        const T* in_pos_2 =
+            &in[out_id_h * input_w + access_y * in_img_w * num_channels +
+                access_x_2 * num_channels + channel_id];
+        const T* in_pos_3 =
+            &in[out_id_h * input_w + access_y * in_img_w * num_channels +
+                access_x_3 * num_channels + channel_id];
+
+        coefficients[k] = Kecubic_interp(in_pos_0[0], in_pos_1[0], in_pos_2[0],
+                                         in_pos_3[0], x_t);
+      }
+
+      // Combine the four row results along y.
+      out[out_id_h * output_w + out_id_w] =
+          static_cast<T>(Kecubic_interp(coefficients[0], coefficients[1],
+                                        coefficients[2], coefficients[3], y_t));
+    }
+  }
+}
+
+// Backward kernel for bicubic interpolation.
+//
+// `out` holds the output gradient (read only) and `in` the input gradient,
+// which is accumulated into with atomic adds. Each output-gradient element is
+// distributed over a 4x4 window of input positions around its source
+// coordinate, weighted by the x/y coefficients that
+// get_cubic_upsample_coefficients() produces; window coordinates are clamped
+// to the valid image range.
+template <typename T>
+__global__ void KeBicubicInterpBw(
+    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
+    const size_t input_w, const T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners, const DataLayout data_layout) {
+  const int total = output_h * output_w;
+  const int step = blockDim.x * gridDim.x;
+  const bool is_nchw = (data_layout == DataLayout::kNCHW);
+  // Elements per channel plane; these do not depend on the element index.
+  const int in_img_size = input_w / num_channels;
+  const int out_img_size = output_w / num_channels;
+  // Largest valid row / column index of the input image.
+  const int last_row = static_cast<int>(in_img_h - 1);
+  const int last_col = static_cast<int>(in_img_w - 1);
+
+  // Grid-stride loop over every output-gradient element.
+  for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < total;
+       tid += step) {
+    const int out_id_h = tid / output_w;
+    const int out_id_w = tid % output_w;
+
+    // Decode (channel, y, x) of this element for the active layout.
+    int channel_id;
+    int out_img_idy;
+    int out_img_idx;
+    if (is_nchw) {
+      channel_id = out_id_w / out_img_size;
+      out_img_idy = (out_id_w % out_img_size) / out_img_w;
+      out_img_idx = tid % out_img_w;
+    } else {
+      out_img_idy = out_id_w / (out_img_w * num_channels);
+      out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
+      channel_id = tid % num_channels;
+    }
+
+    // Map the output coordinate back to a fractional input coordinate.
+    T src_y;
+    T src_x;
+    if (align_corners) {
+      src_y = static_cast<T>(ratio_h * out_img_idy);
+      src_x = static_cast<T>(ratio_w * out_img_idx);
+    } else {
+      src_y = static_cast<T>(ratio_h * (out_img_idy + 0.5) - 0.5);
+      src_x = static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
+    }
+    const int input_y = floorf(src_y);
+    const int input_x = floorf(src_x);
+    // Fractional offsets inside the source cell.
+    const T y_t = src_y - input_y;
+    const T x_t = src_x - input_x;
+
+    T x_coeffs[4];
+    T y_coeffs[4];
+    get_cubic_upsample_coefficients(x_coeffs, x_t);
+    get_cubic_upsample_coefficients(y_coeffs, y_t);
+
+    const T grad = out[out_id_h * output_w + out_id_w];
+
+    for (int i = 0; i < 4; ++i) {
+      const int access_x =
+          max(min(static_cast<int>(input_x - 1 + i), last_col), 0);
+      for (int j = 0; j < 4; ++j) {
+        const int access_y =
+            max(min(static_cast<int>(input_y - 1 + j), last_row), 0);
+        T* in_pos;
+        if (is_nchw) {
+          in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
+                       access_y * in_img_w + access_x];
+        } else {
+          in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels +
+                       access_x * num_channels + channel_id];
+        }
+        // Several threads may target the same input element, hence atomic.
+        platform::CudaAtomicAdd(in_pos, (grad * y_coeffs[j] * x_coeffs[i]));
+      }
+    }
+  }
+}
+
+// Forward 1-D interpolation on GPU (only "linear" launches a kernel).
+//
+// The output width comes from the SizeTensor inputs when present. Otherwise
+// it starts from the "out_w" attribute, is replaced by in_w * scale when a
+// positive scale is supplied (Scale input first, "scale" attribute second),
+// and is finally replaced by the OutSize input if that is given.
+// `output` is allocated as [n, c, out_w] for NCHW or [n, out_w, c] otherwise;
+// when the width is unchanged the input is simply copied.
+template <typename T>
+static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx,
+                                 const Tensor& input, Tensor* output) {
+  auto* src = input.data<T>();
+
+  const std::string layout_name = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(layout_name);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  const auto interp_method = ctx.Attr<std::string>("interp_method");
+  const bool align_corners = ctx.Attr<bool>("align_corners");
+  const int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_w = ctx.Attr<int>("out_w");
+
+  const auto size_tensors = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (!size_tensors.empty()) {
+    // Shape tensors take precedence over every other size source.
+    const auto requested = get_new_shape(size_tensors);
+    out_w = requested[0];
+  } else {
+    float scale_w = -1;
+    const auto* scale_tensor = ctx.Input<Tensor>("Scale");
+    const auto scale = ctx.Attr<std::vector<float>>("scale");
+    if (scale_tensor != nullptr) {
+      const auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+      scale_w = scale_data[0];
+      PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                               "scale  of Op(interpolate) "
+                                               "should be greater than 0."));
+    } else if (scale.size() > 0) {
+      scale_w = scale[0];
+      PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                               "scale  of Op(interpolate) "
+                                               "should be greater than 0."));
+    }
+    if (scale_w > 0.) {
+      out_w = static_cast<int>(in_w * scale_w);
+    }
+    const auto* out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      // Bring the size tensor to the host before reading it.
+      Tensor host_sizes;
+      framework::TensorCopySync(*out_size, platform::CPUPlace(), &host_sizes);
+      out_w = host_sizes.data<int>()[0];
+    }
+  }
+  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
+                                  "out_w in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {n, c, out_w};
+  } else {
+    dim_out = {n, out_w, c};
+  }
+  auto dst = output->mutable_data<T>(dim_out, ctx.GetPlace());
+
+  // Same width: nothing to interpolate.
+  if (in_w == out_w) {
+    framework::TensorCopy(input, ctx.GetPlace(), output);
+    return;
+  }
+
+  // Source step per output pixel; stays 0 for a single-pixel output.
+  float ratio_w = 0.f;
+  if (out_w > 1) {
+    if (align_corners) {
+      ratio_w = static_cast<float>(in_w - 1.0) / (out_w - 1.0);
+    } else {
+      ratio_w = static_cast<float>(in_w) / out_w;
+    }
+  }
+
+  const int in_cw = c * in_w;
+  const int out_cw = c * out_w;
+  const int pixel_count = n * out_cw;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixel_count, ctx);
+
+  if (interp_method == "linear") {
+    KeLinearInterpFw<T><<<config.blocks, config.threads, 0,
+                          ctx.cuda_device_context().stream()>>>(
+        src, in_w, in_cw, dst, out_w, n, out_cw, c, ratio_w, align_corners,
+        align_mode, data_layout);
+  }
+}
+
+// Forward 2-D interpolation on GPU; dispatches to the "nearest", "bilinear"
+// or "bicubic" kernel according to the "interp_method" attribute.
+//
+// The output size (out_h, out_w) comes from the SizeTensor inputs when
+// present. Otherwise it starts from the "out_h"/"out_w" attributes, is
+// replaced by the scaled input size when positive scales are supplied (Scale
+// input first, "scale" attribute second), and is finally replaced by the
+// OutSize input if that is given. When the size is unchanged the input is
+// simply copied to `output`.
+template <typename T>
+static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
+                                 const Tensor& input, Tensor* output) {
+  auto* src = input.data<T>();
+
+  const std::string layout_name = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(layout_name);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  const auto interp_method = ctx.Attr<std::string>("interp_method");
+  const bool align_corners = ctx.Attr<bool>("align_corners");
+  const int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+
+  const auto size_tensors = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (!size_tensors.empty()) {
+    // Shape tensors take precedence over every other size source.
+    const auto requested = get_new_shape(size_tensors);
+    out_h = requested[0];
+    out_w = requested[1];
+  } else {
+    float scale_h = -1;
+    float scale_w = -1;
+    const auto* scale_tensor = ctx.Input<Tensor>("Scale");
+    const auto scale = ctx.Attr<std::vector<float>>("scale");
+    if (scale_tensor != nullptr) {
+      const auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+      // One value scales both axes; two values are taken as (h, w).
+      scale_h = scale_data[0];
+      scale_w = scale_data.size() > 1 ? scale_data[1] : scale_data[0];
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    } else if (scale.size() > 1) {
+      scale_h = scale[0];
+      scale_w = scale[1];
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    }
+    if (scale_w > 0. && scale_h > 0.) {
+      out_h = static_cast<int>(in_h * scale_h);
+      out_w = static_cast<int>(in_w * scale_w);
+    }
+    const auto* out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      // Bring the size tensor to the host before reading it.
+      Tensor host_sizes;
+      framework::TensorCopySync(*out_size, platform::CPUPlace(), &host_sizes);
+      auto size_data = host_sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
+  }
+  PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
+                                  "out_h in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
+                                  "out_w in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {n, c, out_h, out_w};
+  } else {
+    dim_out = {n, out_h, out_w, c};
+  }
+  auto dst = output->mutable_data<T>(dim_out, ctx.GetPlace());
+
+  // Same spatial size: nothing to interpolate.
+  const bool same_size = (in_h == out_h) && (in_w == out_w);
+  if (same_size) {
+    framework::TensorCopy(input, ctx.GetPlace(), output);
+    return;
+  }
+
+  // Source step per output pixel; stays 0 along an axis of output length 1.
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_h > 1) {
+    if (align_corners) {
+      ratio_h = static_cast<float>(in_h - 1) / (out_h - 1);
+    } else {
+      ratio_h = static_cast<float>(in_h) / out_h;
+    }
+  }
+  if (out_w > 1) {
+    if (align_corners) {
+      ratio_w = static_cast<float>(in_w - 1) / (out_w - 1);
+    } else {
+      ratio_w = static_cast<float>(in_w) / out_w;
+    }
+  }
+
+  const int in_hw = in_h * in_w;
+  const int out_hw = out_h * out_w;
+  const int in_chw = c * in_hw;
+  const int out_chw = c * out_hw;
+  const int pixel_count = n * out_chw;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixel_count, ctx);
+  auto stream = ctx.cuda_device_context().stream();
+
+  if (interp_method == "nearest") {
+    KeNearestNeighborInterpFw<T><<<config.blocks, config.threads, 0, stream>>>(
+        src, in_h, in_w, n, in_chw, dst, out_h, out_w, n, out_chw, c, ratio_h,
+        ratio_w, align_corners, data_layout);
+  } else if (interp_method == "bilinear") {
+    KeBilinearInterpFw<T><<<config.blocks, config.threads, 0, stream>>>(
+        src, in_h, in_w, n, in_chw, dst, out_h, out_w, n, out_chw, c, ratio_h,
+        ratio_w, align_corners, align_mode, data_layout);
+  } else if (interp_method == "bicubic") {
+    // The bicubic kernel is launched with a fixed 512 threads per block.
+    KeBicubicInterpFw<T><<<config.blocks, 512, 0, stream>>>(
+        src, in_h, in_w, n, in_chw, dst, out_h, out_w, n, out_chw, c, ratio_h,
+        ratio_w, align_corners, data_layout);
+  }
+}
+
+// Forward 3-D interpolation on GPU (only "trilinear" launches a kernel).
+//
+// The output size (out_d, out_h, out_w) comes from the SizeTensor inputs when
+// present. Otherwise it starts from the "out_d"/"out_h"/"out_w" attributes,
+// is replaced by the scaled input size when positive scales are supplied
+// (Scale input first, "scale" attribute second), and is finally replaced by
+// the OutSize input if that is given. When the size is unchanged the input is
+// simply copied to `output`.
+template <typename T>
+static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
+                                 const Tensor& input, Tensor* output) {
+  auto* input_data = input.data<T>();
+
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_d = ctx.Attr<int>("out_d");
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+
+  auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_shape_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_shape_tensor);
+    out_d = new_size[0];
+    out_h = new_size[1];
+    out_w = new_size[2];
+  } else {
+    float scale_d = -1;
+    float scale_h = -1;
+    float scale_w = -1;
+    auto scale_tensor = ctx.Input<Tensor>("Scale");
+    auto scale = ctx.Attr<std::vector<float>>("scale");
+    if (scale_tensor != nullptr) {
+      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+      // Three per-axis values are required before index 2 may be read; with
+      // fewer, the first value is applied to every axis.
+      if (scale_data.size() > 2) {
+        scale_d = scale_data[0];
+        scale_h = scale_data[1];
+        scale_w = scale_data[2];
+      } else {
+        scale_d = scale_data[0];
+        scale_h = scale_data[0];
+        scale_w = scale_data[0];
+      }
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    } else {
+      // The attribute is only honoured when it carries all three axes;
+      // reading scale[2] from a shorter vector would be out of bounds.
+      if (scale.size() > 2) {
+        scale_d = scale[0];
+        scale_h = scale[1];
+        scale_w = scale[2];
+
+        PADDLE_ENFORCE_EQ(
+            scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+            platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                              "should be greater than 0."));
+      }
+    }
+    if (scale_d > 0. && scale_h > 0. && scale_w > 0.) {
+      out_d = static_cast<int>(in_d * scale_d);
+      out_h = static_cast<int>(in_h * scale_h);
+      out_w = static_cast<int>(in_w * scale_w);
+    }
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      // Bring the size tensor to the host before reading it.
+      Tensor sizes;
+      framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_d = size_data[0];
+      out_h = size_data[1];
+      out_w = size_data[2];
+    }
+  }
+  PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument(
+                                  "out_d in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
+                                  "out_h in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
+                                  "out_w in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {n, c, out_d, out_h, out_w};
+  } else {
+    dim_out = {n, out_d, out_h, out_w, c};
+  }
+  auto output_data = output->mutable_data<T>(dim_out, ctx.GetPlace());
+
+  // Same spatial size: nothing to interpolate.
+  if (in_d == out_d && in_h == out_h && in_w == out_w) {
+    framework::TensorCopy(input, ctx.GetPlace(), output);
+    return;
+  }
+
+  // Source step per output voxel; stays 0 along an axis of output length 1.
+  float ratio_d = 0.f;
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_d > 1) {
+    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
+                              : static_cast<float>(in_d) / out_d;
+  }
+  if (out_h > 1) {
+    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                              : static_cast<float>(in_h) / out_h;
+  }
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  int in_dhw = in_d * in_h * in_w;
+  int out_dhw = out_d * out_h * out_w;
+  int in_cdhw = c * in_dhw;
+  int out_cdhw = c * out_dhw;
+
+  int pixelNum = n * out_cdhw;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
+
+  if ("trilinear" == interp_method) {
+    KeTrilinearInterpFw<T><<<config.blocks, config.threads, 0,
+                             ctx.cuda_device_context().stream()>>>(
+        input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h,
+        out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
+        align_mode, data_layout);
+  }
+}
+
+// Backward 1-D interpolation on GPU (only "linear" launches a kernel).
+//
+// Re-derives the forward output width: it starts from the "out_w" attribute
+// and is overridden, in this order, by a positive scale (Scale input first,
+// "scale" attribute second), by the OutSize input and by the SizeTensor
+// inputs. `input_grad` is allocated with the shape of input "X" and zeroed
+// before the kernel runs; when the width is unchanged `output_grad` is
+// copied into it instead.
+template <typename T>
+static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx,
+                                 Tensor* input_grad,
+                                 const Tensor& output_grad) {
+  auto* input = ctx.Input<Tensor>("X");
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_w = ctx.Attr<int>("out_w");
+  float scale_w = -1;
+  auto scale_tensor = ctx.Input<Tensor>("Scale");
+  auto scale = ctx.Attr<std::vector<float>>("scale");
+  if (scale_tensor != nullptr) {
+    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+    scale_w = scale_data[0];
+    PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                             "scale  of Op(interpolate) "
+                                             "should be greater than 0."));
+  } else {
+    if (scale.size() > 0) {
+      scale_w = scale[0];
+
+      PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                               "scale  of Op(interpolate) "
+                                               "should be greater than 0."));
+    }
+  }
+  if (scale_w > 0.) {
+    out_w = static_cast<int>(in_w * scale_w);
+  }
+
+  auto out_size = ctx.Input<Tensor>("OutSize");
+  if (out_size != nullptr) {
+    // Bring the size tensor to the host before reading it.
+    Tensor sizes;
+    framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
+    auto size_data = sizes.data<int>();
+    out_w = size_data[0];
+  }
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_w = new_size[0];
+  }
+
+  auto* output_grad_data = output_grad.data<T>();
+  framework::DDim dim_grad;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_grad = {n, c, in_w};
+  } else {
+    dim_grad = {n, in_w, c};
+  }
+  auto* input_grad_data = input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+  // The kernel accumulates into the gradient, so it must start from zero.
+  auto& device_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  math::SetConstant<platform::CUDADeviceContext, T> zero;
+  zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+  // Same width: the gradient passes through unchanged.
+  if (in_w == out_w) {
+    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
+    return;
+  }
+
+  // Source step per output pixel; stays 0 for a single-pixel output.
+  float ratio_w = 0.f;
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+  int in_cw = c * in_w;
+  int out_cw = c * out_w;
+  int pixelNum = n * out_cw;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
+
+  if ("linear" == interp_method) {
+    KeLinearInterpBw<T><<<config.blocks, config.threads, 0,
+                          ctx.cuda_device_context().stream()>>>(
+        input_grad_data, in_w, in_cw, output_grad_data, out_w, n, out_cw, c,
+        ratio_w, align_corners, align_mode, data_layout);
+  }
+}
+
+// Backward 2-D interpolation on GPU; dispatches to the "nearest", "bilinear"
+// or "bicubic" gradient kernel according to the "interp_method" attribute.
+//
+// Re-derives the forward output size: it starts from the "out_h"/"out_w"
+// attributes and is overridden, in this order, by positive scales (Scale
+// input first, "scale" attribute second), by the OutSize input and by the
+// SizeTensor inputs. `input_grad` is allocated with the shape of input "X"
+// and zeroed before the kernel runs; when the size is unchanged
+// `output_grad` is copied into it instead.
+template <typename T>
+static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
+                                 Tensor* input_grad,
+                                 const Tensor& output_grad) {
+  auto* input = ctx.Input<Tensor>("X");
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+  float scale_h = -1;
+  float scale_w = -1;
+  auto scale_tensor = ctx.Input<Tensor>("Scale");
+  auto scale = ctx.Attr<std::vector<float>>("scale");
+  if (scale_tensor != nullptr) {
+    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+    // One value scales both axes; two values are taken as (h, w).
+    if (scale_data.size() > 1) {
+      scale_h = scale_data[0];
+      scale_w = scale_data[1];
+    } else {
+      scale_h = scale_data[0];
+      scale_w = scale_data[0];
+    }
+    PADDLE_ENFORCE_EQ(
+        scale_w > 0 && scale_h > 0, true,
+        platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                          "should be greater than 0."));
+  } else {
+    if (scale.size() > 1) {
+      scale_w = scale[1];
+      scale_h = scale[0];
+
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    }
+  }
+  if (scale_w > 0. && scale_h > 0.) {
+    out_h = static_cast<int>(in_h * scale_h);
+    out_w = static_cast<int>(in_w * scale_w);
+  }
+
+  auto out_size = ctx.Input<Tensor>("OutSize");
+  if (out_size != nullptr) {
+    // Bring the size tensor to the host before reading it.
+    Tensor sizes;
+    framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
+    auto size_data = sizes.data<int>();
+    out_h = size_data[0];
+    out_w = size_data[1];
+  }
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_h = new_size[0];
+    out_w = new_size[1];
+  }
+
+  auto* output_grad_data = output_grad.data<T>();
+  framework::DDim dim_grad;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_grad = {n, c, in_h, in_w};
+  } else {
+    dim_grad = {n, in_h, in_w, c};
+  }
+  auto* input_grad_data = input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+  // The kernels accumulate into the gradient, so it must start from zero.
+  auto& device_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  math::SetConstant<platform::CUDADeviceContext, T> zero;
+  zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+  // Same spatial size: the gradient passes through unchanged.
+  if (in_h == out_h && in_w == out_w) {
+    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
+    return;
+  }
+
+  // Source step per output pixel; stays 0 along an axis of output length 1.
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_h > 1) {
+    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                              : static_cast<float>(in_h) / out_h;
+  }
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  int in_hw = in_h * in_w;
+  int out_hw = out_h * out_w;
+  int in_chw = c * in_hw;
+  int out_chw = c * out_hw;
+
+  int pixelNum = n * out_chw;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
+
+  if ("nearest" == interp_method) {
+    KeNearestNeighborInterpBw<T><<<config.blocks, config.threads, 0,
+                                   ctx.cuda_device_context().stream()>>>(
+        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
+        n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
+  } else if ("bilinear" == interp_method) {
+    KeBilinearInterpBw<T><<<config.blocks, config.threads, 0,
+                            ctx.cuda_device_context().stream()>>>(
+        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
+        n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode,
+        data_layout);
+  } else if ("bicubic" == interp_method) {
+    // The bicubic kernel is launched with a fixed 512 threads per block.
+    KeBicubicInterpBw<
+        T><<<config.blocks, 512, 0, ctx.cuda_device_context().stream()>>>(
+        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
+        n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
+  }
+}
+
+// Backward 3-D interpolation on GPU (only "trilinear" launches a kernel).
+//
+// Re-derives the forward output size: it starts from the
+// "out_d"/"out_h"/"out_w" attributes and is overridden, in this order, by
+// positive scales (Scale input first, "scale" attribute second), by the
+// OutSize input and by the SizeTensor inputs. `input_grad` is allocated with
+// the shape of input "X" and zeroed before the kernel runs; when the size is
+// unchanged `output_grad` is copied into it instead.
+template <typename T>
+static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
+                                 Tensor* input_grad,
+                                 const Tensor& output_grad) {
+  auto* input = ctx.Input<Tensor>("X");
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_d = ctx.Attr<int>("out_d");
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+  float scale_d = -1;
+  float scale_h = -1;
+  float scale_w = -1;
+  auto scale_tensor = ctx.Input<Tensor>("Scale");
+  auto scale = ctx.Attr<std::vector<float>>("scale");
+  if (scale_tensor != nullptr) {
+    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+    // Three per-axis values are required before index 2 may be read; with
+    // fewer, the first value is applied to every axis.
+    if (scale_data.size() > 2) {
+      scale_d = scale_data[0];
+      scale_h = scale_data[1];
+      scale_w = scale_data[2];
+    } else {
+      scale_d = scale_data[0];
+      scale_h = scale_data[0];
+      scale_w = scale_data[0];
+    }
+    PADDLE_ENFORCE_EQ(
+        scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+        platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                          "should be greater than 0."));
+  } else {
+    // The attribute is only honoured when it carries all three axes; reading
+    // scale[2] from a shorter vector would be out of bounds.
+    if (scale.size() > 2) {
+      scale_d = scale[0];
+      scale_h = scale[1];
+      scale_w = scale[2];
+
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    }
+  }
+  if (scale_d > 0. && scale_h > 0. && scale_w > 0.) {
+    out_d = static_cast<int>(in_d * scale_d);
+    out_h = static_cast<int>(in_h * scale_h);
+    out_w = static_cast<int>(in_w * scale_w);
+  }
+
+  auto out_size = ctx.Input<Tensor>("OutSize");
+  if (out_size != nullptr) {
+    // Bring the size tensor to the host before reading it.
+    Tensor sizes;
+    framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes);
+    auto size_data = sizes.data<int>();
+    out_d = size_data[0];
+    out_h = size_data[1];
+    out_w = size_data[2];
+  }
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_d = new_size[0];
+    out_h = new_size[1];
+    out_w = new_size[2];
+  }
+
+  auto* output_grad_data = output_grad.data<T>();
+  framework::DDim dim_grad;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_grad = {n, c, in_d, in_h, in_w};
+  } else {
+    dim_grad = {n, in_d, in_h, in_w, c};
+  }
+  auto* input_grad_data = input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+  // The kernel accumulates into the gradient, so it must start from zero.
+  auto& device_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  math::SetConstant<platform::CUDADeviceContext, T> zero;
+  zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+  // Same spatial size: the gradient passes through unchanged.
+  if (in_d == out_d && in_h == out_h && in_w == out_w) {
+    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
+    return;
+  }
+
+  // Source step per output voxel; stays 0 along an axis of output length 1.
+  float ratio_d = 0.f;
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_d > 1) {
+    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
+                              : static_cast<float>(in_d) / out_d;
+  }
+  if (out_h > 1) {
+    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                              : static_cast<float>(in_h) / out_h;
+  }
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  int in_dhw = in_d * in_h * in_w;
+  int out_dhw = out_d * out_h * out_w;
+  int in_cdhw = c * in_dhw;
+  int out_cdhw = c * out_dhw;
+
+  int pixelNum = n * out_cdhw;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
+
+  if ("trilinear" == interp_method) {
+    KeTrilinearInterpBw<T><<<config.blocks, config.threads, 0,
+                             ctx.cuda_device_context().stream()>>>(
+        input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d,
+        out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
+        align_mode, data_layout);
+  }
+}
+
+// CUDA forward kernel for the *_interp_v2 operators. Picks the 1-D, 2-D or
+// 3-D implementation from the rank of input "X" (3, 4 or 5); any other rank
+// launches nothing.
+template <typename T>
+class InterpolateOpV2CUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::NotFound("This kernel only runs on GPU device."));
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+
+    switch (input->dims().size()) {
+      case 3:  // 1D interpolation
+        Interpolate1DCUDAFwd<T>(ctx, *input, output);
+        break;
+      case 4:  // 2D interpolation
+        Interpolate2DCUDAFwd<T>(ctx, *input, output);
+        break;
+      case 5:  // 3D interpolation
+        Interpolate3DCUDAFwd<T>(ctx, *input, output);
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+// CUDA backward kernel for the *_interp_v2 operators. Picks the 1-D, 2-D or
+// 3-D implementation from the rank of the gradient of "Out" (3, 4 or 5); any
+// other rank launches nothing.
+template <typename T>
+class InterpolateV2GradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::NotFound("This kernel only runs on GPU device."));
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    switch (output_grad->dims().size()) {
+      case 3:  // 1D interpolation
+        Interpolate1DCUDABwd<T>(ctx, input_grad, *output_grad);
+        break;
+      case 4:  // 2D interpolation
+        Interpolate2DCUDABwd<T>(ctx, input_grad, *output_grad);
+        break;
+      case 5:  // 3D interpolation
+        Interpolate3DCUDABwd<T>(ctx, input_grad, *output_grad);
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // shorthand used by the registrations below
+namespace plat = paddle::platform;  // not referenced by the registrations below
+REGISTER_OP_CUDA_KERNEL(bilinear_interp_v2,  // forward: float, double, int
+                        ops::InterpolateOpV2CUDAKernel<float>,
+                        ops::InterpolateOpV2CUDAKernel<double>,
+                        ops::InterpolateOpV2CUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(bilinear_interp_v2_grad,  // backward: float, double
+                        ops::InterpolateV2GradOpCUDAKernel<float>,
+                        ops::InterpolateV2GradOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(nearest_interp_v2,  // forward: float, double, int
+                        ops::InterpolateOpV2CUDAKernel<float>,
+                        ops::InterpolateOpV2CUDAKernel<double>,
+                        ops::InterpolateOpV2CUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(nearest_interp_v2_grad,  // backward: float, double
+                        ops::InterpolateV2GradOpCUDAKernel<float>,
+                        ops::InterpolateV2GradOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(trilinear_interp_v2,  // forward: float, double, int
+                        ops::InterpolateOpV2CUDAKernel<float>,
+                        ops::InterpolateOpV2CUDAKernel<double>,
+                        ops::InterpolateOpV2CUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(trilinear_interp_v2_grad,  // backward: float, double
+                        ops::InterpolateV2GradOpCUDAKernel<float>,
+                        ops::InterpolateV2GradOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(linear_interp_v2, ops::InterpolateOpV2CUDAKernel<float>,  // forward: float, double, int
+                        ops::InterpolateOpV2CUDAKernel<double>,
+                        ops::InterpolateOpV2CUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(linear_interp_v2_grad,  // backward: float, double
+                        ops::InterpolateV2GradOpCUDAKernel<float>,
+                        ops::InterpolateV2GradOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(bicubic_interp_v2,  // forward: float, double, int
+                        ops::InterpolateOpV2CUDAKernel<float>,
+                        ops::InterpolateOpV2CUDAKernel<double>,
+                        ops::InterpolateOpV2CUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(bicubic_interp_v2_grad,  // backward: float, double
+                        ops::InterpolateV2GradOpCUDAKernel<float>,
+                        ops::InterpolateV2GradOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..111766934b8300c0a7b46ae9a065b8c42460e577
--- /dev/null
+++ b/paddle/fluid/operators/interpolate_v2_op.h
@@ -0,0 +1,1386 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;  // row-major, DenseIndex by default
+using Tensor = framework::Tensor;  // shorthand used throughout this header
+using DataLayout = framework::DataLayout;  // shorthand used throughout this header
+
+// Builds a shape vector from a list of single-element tensors.
+//
+// Each tensor must have dims [1] (enforced); its int32 value becomes one
+// entry of the result, in list order. A tensor that lives on a GPU place is
+// copied synchronously to the CPU before its value is read.
+inline std::vector<int> get_new_shape(
+    const std::vector<const Tensor*>& list_new_shape_tensor) {
+  std::vector<int> shape_values;
+  for (const Tensor* tensor : list_new_shape_tensor) {
+    PADDLE_ENFORCE_EQ(
+        tensor->dims(), framework::make_ddim({1}),
+        platform::errors::InvalidArgument("shape of dim tensor should be [1]"));
+    if (!platform::is_gpu_place(tensor->place())) {
+      shape_values.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
+      continue;
+    }
+    // GPU-resident value: stage it on the CPU, then read the staged copy.
+    framework::Tensor host_copy;
+    TensorCopySync(*tensor, platform::CPUPlace(), &host_copy);
+    shape_values.push_back(static_cast<int32_t>(*host_copy.data<int32_t>()));
+  }
+  return shape_values;
+}
+
+// Returns the numel() elements of `new_data_tensor` as a std::vector<T>.
+// When the tensor lives on a GPU place it is first copied synchronously to
+// the CPU and the elements are read from that copy.
+template <typename T>
+inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
+  const T* first = new_data_tensor->data<T>();
+  // Declared outside the branch so the staged copy stays alive while
+  // `first` points into it.
+  framework::Tensor host_copy;
+  if (platform::is_gpu_place(new_data_tensor->place())) {
+    TensorCopySync(*new_data_tensor, platform::CPUPlace(), &host_copy);
+    first = host_copy.data<T>();
+  }
+  return std::vector<T>(first, first + new_data_tensor->numel());
+}
+
+// Splits `dims` into batch (N), channel (C), depth (D), height (H) and
+// width (W) according to `data_layout`.
+//
+// kNCHW means the channel axis is dims[1]; any other layout takes the
+// channel from the last axis of the handled rank. Rank 3 yields D = H = 1,
+// rank 4 yields D = 1, and every other rank is read as a 5-D shape.
+inline void ExtractNCDWH(const framework::DDim& dims,
+                         const DataLayout& data_layout, int* N, int* C, int* D,
+                         int* H, int* W) {
+  const bool channel_first = (data_layout == DataLayout::kNCHW);
+  *N = dims[0];
+
+  switch (dims.size()) {
+    case 3:
+      *C = dims[channel_first ? 1 : 2];
+      *D = 1;
+      *H = 1;
+      *W = dims[channel_first ? 2 : 1];
+      break;
+    case 4:
+      *C = dims[channel_first ? 1 : 3];
+      *D = 1;
+      *H = dims[channel_first ? 2 : 1];
+      *W = dims[channel_first ? 3 : 2];
+      break;
+    default:
+      *C = dims[channel_first ? 1 : 4];
+      *D = dims[channel_first ? 2 : 1];
+      *H = dims[channel_first ? 3 : 2];
+      *W = dims[channel_first ? 4 : 3];
+      break;
+  }
+}
+
+// Forward nearest-neighbour resize of a rank-4 tensor on the CPU.
+//
+// Each output pixel (k, l) copies the input pixel at
+// (int(ratio_h * k), int(ratio_w * l)); with align_corners the products are
+// rounded (+0.5) before truncation. The tensors are indexed as
+// (n, c, h, w) for kNCHW and as (n, h, w, c) otherwise.
+template <typename T>
+static void NearestNeighborInterpolate(const Tensor& input, Tensor* output,
+                                       const float ratio_h, const float ratio_w,
+                                       const int n, const int c,
+                                       const int out_h, const int out_w,
+                                       const bool align_corners,
+                                       const DataLayout& data_layout) {
+  auto src = EigenTensor<T, 4>::From(input);
+  auto dst = EigenTensor<T, 4>::From(*output);
+  const bool channel_first = (data_layout == DataLayout::kNCHW);
+
+  // Maps an output coordinate to the source coordinate it samples.
+  const auto source_index = [align_corners](const float ratio,
+                                            const int out_idx) {
+    return align_corners ? static_cast<int>(ratio * out_idx + 0.5)
+                         : static_cast<int>(ratio * out_idx);
+  };
+
+  for (int oh = 0; oh < out_h; ++oh) {
+    const int ih = source_index(ratio_h, oh);
+    for (int ow = 0; ow < out_w; ++ow) {
+      const int iw = source_index(ratio_w, ow);
+      for (int b = 0; b < n; ++b) {
+        for (int ch = 0; ch < c; ++ch) {
+          if (channel_first) {
+            dst(b, ch, oh, ow) = src(b, ch, ih, iw);
+          } else {
+            dst(b, oh, ow, ch) = src(b, ih, iw, ch);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Forward 1-D linear interpolation along the width axis on the CPU.
+//
+// input / output are viewed as rank-3 tensors, indexed as (n, c, w) for
+// kNCHW and as (n, w, c) otherwise. For every output column the two source
+// columns and their blend weights are computed once and then reused for
+// every batch/channel pair.
+//
+//   ratio_w        source step per output column, supplied by the caller
+//   in_w / out_w   source and destination widths
+//   n / c          batch size and channel count
+//   align_corners, align_mode
+//                  half-pixel source coordinates are used only when
+//                  align_mode == 0 and align_corners is false
+//
+// NOTE(review): align_mode is declared bool here but int in
+// LinearInterpolationGrad; the `== 0` test gives the same result either way.
+template <typename T>
+static void LinearInterpolation(const Tensor& input, Tensor* output,
+                                const float ratio_w, const int in_w,
+                                const int n, const int c, const int out_w,
+                                const bool align_corners, const bool align_mode,
+                                const DataLayout data_layout) {
+  auto input_t = EigenTensor<T, 3>::From(input);
+  auto output_t = EigenTensor<T, 3>::From(*output);
+  bool align_flag = (align_mode == 0 && !align_corners);
+
+  // The tables are filled through operator[], which is only valid for
+  // indices below size(); reserve() changes capacity but not size, so the
+  // vectors are constructed with out_w elements instead.
+  std::vector<int> vx_w(out_w), vx_e(out_w);
+  std::vector<float> vd_w(out_w), vd_e(out_w);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int l = 0; l < out_w; l++) {
+    int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * l);
+    x_w = (x_w > 0) ? x_w : 0;                       // left source column
+    int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w;  // right source column
+
+    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+    idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
+    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;  // w1lambda
+    float d_e = 1.f - d_w;                                         // w2lambda
+    {
+      vx_w[l] = x_w;
+      vx_e[l] = x_e;
+      vd_w[l] = d_w;
+      vd_e[l] = d_e;
+    }
+  }
+
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(3)
+#endif
+  for (int i = 0; i < n; i++) {    // loop for batches
+    for (int j = 0; j < c; j++) {  // loop for channels
+      for (int l = 0; l < out_w; l++) {
+        // linear interpolation: blend the left and right source columns
+        T out_t;
+        if (data_layout == DataLayout::kNCHW) {
+          out_t = input_t(i, j, vx_w[l]) * vd_e[l] +
+                  input_t(i, j, vx_e[l]) * vd_w[l];
+          output_t(i, j, l) = out_t;
+        } else {
+          out_t = input_t(i, vx_w[l], j) * vd_e[l] +
+                  input_t(i, vx_e[l], j) * vd_w[l];
+          output_t(i, l, j) = out_t;
+        }
+      }
+    }
+  }
+}
+
+// Backward pass of 1-D linear interpolation on the CPU.
+//
+// For each output column the gradient is split between the two source
+// columns it was blended from and accumulated (+=) into input_grad, using
+// weights derived the same way as in the forward computation. Tensors are
+// indexed as (n, c, w) for kNCHW and as (n, w, c) otherwise.
+template <typename T>
+static void LinearInterpolationGrad(const Tensor& output_grad,
+                                    Tensor* input_grad, const float ratio_w,
+                                    const int in_w, const int n, const int c,
+                                    const int out_w, const bool align_corners,
+                                    const int align_mode,
+                                    const DataLayout data_layout) {
+  auto din = EigenTensor<T, 3>::From(*input_grad);
+  auto dout = EigenTensor<T, 3>::From(output_grad);
+  const bool half_pixel = (align_mode == 0 && !align_corners);
+  const bool channel_first = (data_layout == DataLayout::kNCHW);
+
+  for (int ow = 0; ow < out_w; ++ow) {
+    int left = half_pixel ? static_cast<int>(ratio_w * (ow + 0.5) - 0.5)
+                          : static_cast<int>(ratio_w * ow);
+    left = (left > 0) ? left : 0;
+    const int right = (left < (in_w - 1)) ? (left + 1) : left;
+
+    float src_x = ratio_w * (ow + 0.5) - 0.5;
+    src_x = (src_x > 0) ? src_x : 0;
+    // Fraction of the gradient routed to the right column; the rest goes
+    // to the left column.
+    const float to_right = half_pixel ? src_x - left : ratio_w * ow - left;
+    const float to_left = 1.f - to_right;
+
+    for (int b = 0; b < n; ++b) {
+      for (int ch = 0; ch < c; ++ch) {
+        if (channel_first) {
+          const T g = dout(b, ch, ow);
+          din(b, ch, left) += static_cast<T>(g * to_left);
+          din(b, ch, right) += static_cast<T>(g * to_right);
+        } else {
+          const T g = dout(b, ow, ch);
+          din(b, left, ch) += static_cast<T>(g * to_left);
+          din(b, right, ch) += static_cast<T>(g * to_right);
+        }
+      }
+    }
+  }
+}
+
+// Forward bilinear interpolation of a rank-4 tensor on the CPU.
+//
+// Tensors are indexed as (n, c, h, w) for kNCHW and as (n, h, w, c)
+// otherwise. Per-row and per-column source indices and blend weights are
+// precomputed into lookup tables, then every output element is the weighted
+// sum of its four neighbouring input elements.
+//
+//   ratio_h / ratio_w   source step per output row / column
+//   in_h, in_w          source extent; out_h, out_w destination extent
+//   n / c               batch size and channel count
+//   align_corners, align_mode
+//                       half-pixel source coordinates are used only when
+//                       align_mode == 0 and align_corners is false
+//
+// NOTE(review): align_mode is declared bool here but int in
+// BilinearInterpolationGrad; the `== 0` test gives the same result either
+// way.
+template <typename T>
+static void BilinearInterpolation(const Tensor& input, Tensor* output,
+                                  const float ratio_h, const float ratio_w,
+                                  const int in_h, const int in_w, const int n,
+                                  const int c, const int out_h, const int out_w,
+                                  const bool align_corners,
+                                  const bool align_mode,
+                                  const DataLayout data_layout) {
+  auto input_t = EigenTensor<T, 4>::From(input);
+  auto output_t = EigenTensor<T, 4>::From(*output);
+  bool align_flag = (align_mode == 0 && !align_corners);
+
+  // The tables below are filled through operator[], which is only valid for
+  // indices below size(); reserve() changes capacity but not size, so the
+  // vectors are constructed with their final element count instead.
+  std::vector<int> vy_n(out_h), vy_s(out_h);
+  std::vector<float> vd_n(out_h), vd_s(out_h);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int k = 0; k < out_h; k++) {
+    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * k);
+    y_n = (y_n > 0) ? y_n : 0;
+    int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
+    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
+    idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
+    float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
+    float d_s = 1.f - d_n;
+    {
+      vy_n[k] = y_n;
+      vy_s[k] = y_s;
+      vd_n[k] = d_n;
+      vd_s[k] = d_s;
+    }
+  }
+
+  std::vector<int> vx_w(out_w), vx_e(out_w);
+  std::vector<float> vd_w(out_w), vd_e(out_w);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int l = 0; l < out_w; l++) {
+    int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * l);
+    x_w = (x_w > 0) ? x_w : 0;
+    int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
+    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+    idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
+    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
+    float d_e = 1.f - d_w;
+    {
+      vx_w[l] = x_w;
+      vx_e[l] = x_e;
+      vd_w[l] = d_w;
+      vd_e[l] = d_e;
+    }
+  }
+
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(4)
+#endif
+  for (int i = 0; i < n; i++) {          // loop for batches
+    for (int j = 0; j < c; j++) {        // loop for channels
+      for (int k = 0; k < out_h; k++) {  // loop for images
+        for (int l = 0; l < out_w; l++) {
+          // bilinear interpolation: weighted sum of the four neighbours
+          T out_t;
+          if (data_layout == DataLayout::kNCHW) {
+            out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] +
+                    input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] +
+                    input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] +
+                    input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l];
+            output_t(i, j, k, l) = out_t;
+
+          } else {
+            out_t = input_t(i, vy_n[k], vx_w[l], j) * vd_s[k] * vd_e[l] +
+                    input_t(i, vy_s[k], vx_w[l], j) * vd_n[k] * vd_e[l] +
+                    input_t(i, vy_n[k], vx_e[l], j) * vd_s[k] * vd_w[l] +
+                    input_t(i, vy_s[k], vx_e[l], j) * vd_n[k] * vd_w[l];
+            output_t(i, k, l, j) = out_t;
+          }
+        }
+      }
+    }
+  }
+}
+
+// Forward trilinear interpolation of a rank-5 tensor on the CPU.
+//
+// Tensors are indexed as (n, c, d, h, w) for kNCHW and as (n, d, h, w, c)
+// otherwise. Source indices and blend weights are precomputed per depth
+// slice, row and column; every output element is then the weighted sum of
+// its eight neighbouring input elements.
+//
+//   ratio_d / ratio_h / ratio_w   source step per output slice/row/column
+//   in_d, in_h, in_w              source extent
+//   out_d, out_h, out_w           destination extent
+//   n / c                         batch size and channel count
+//   align_corners, align_mode     half-pixel source coordinates are used
+//                                 only when align_mode == 0 and
+//                                 align_corners is false
+//
+// NOTE(review): align_mode is declared bool here but int in
+// TrilinearInterpolationGrad; the `== 0` test gives the same result either
+// way.
+template <typename T>
+static void TrilinearInterpolation(
+    const Tensor& input, Tensor* output, const float ratio_d,
+    const float ratio_h, const float ratio_w, const int in_d, const int in_h,
+    const int in_w, const int n, const int c, const int out_d, const int out_h,
+    const int out_w, const bool align_corners, const bool align_mode,
+    const DataLayout& data_layout) {
+  auto input_t = EigenTensor<T, 5>::From(input);
+  auto output_t = EigenTensor<T, 5>::From(*output);
+  bool align_flag = (align_mode == 0 && !align_corners);
+
+  // The tables below are filled through operator[], which is only valid for
+  // indices below size(); reserve() changes capacity but not size, so the
+  // vectors are constructed with their final element count instead.
+  std::vector<int> vt_f(out_d), vt_b(out_d);
+  std::vector<float> vd_f(out_d), vd_b(out_d);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int j = 0; j < out_d; j++) {
+    int t_f = align_flag ? static_cast<int>(ratio_d * (j + 0.5) - 0.5)
+                         : static_cast<int>(ratio_d * j);
+    t_f = (t_f > 0) ? t_f : 0;
+    int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1);
+    float idx_src_t = ratio_d * (j + 0.5) - 0.5;
+    idx_src_t = (idx_src_t > 0) ? idx_src_t : 0;
+    float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f;
+    float d_b = 1.f - d_f;
+    {
+      vt_f[j] = t_f;
+      vt_b[j] = t_b;
+      vd_f[j] = d_f;
+      vd_b[j] = d_b;
+    }
+  }
+
+  std::vector<int> vy_n(out_h), vy_s(out_h);
+  std::vector<float> vd_n(out_h), vd_s(out_h);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int k = 0; k < out_h; k++) {
+    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * k);
+    y_n = (y_n > 0) ? y_n : 0;
+    int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
+    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
+    idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
+    float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
+    float d_s = 1.f - d_n;
+    {
+      vy_n[k] = y_n;
+      vy_s[k] = y_s;
+      vd_n[k] = d_n;
+      vd_s[k] = d_s;
+    }
+  }
+
+  std::vector<int> vx_w(out_w), vx_e(out_w);
+  std::vector<float> vd_w(out_w), vd_e(out_w);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int l = 0; l < out_w; l++) {
+    int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * l);
+    x_w = (x_w > 0) ? x_w : 0;
+    int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
+    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+    idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
+    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
+    float d_e = 1.f - d_w;
+    {
+      vx_w[l] = x_w;
+      vx_e[l] = x_e;
+      vd_w[l] = d_w;
+      vd_e[l] = d_e;
+    }
+  }
+
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(5)
+#endif
+  for (int b = 0; b < n; b++) {          // loop for batches
+    for (int i = 0; i < c; i++) {        // loop for channels
+      for (int j = 0; j < out_d; j++) {  // loop for D, H, W
+        for (int k = 0; k < out_h; k++) {
+          for (int l = 0; l < out_w; l++) {
+            // trilinear interpolation: weighted sum of the eight neighbours
+            if (data_layout == DataLayout::kNCHW) {
+              T out_t = input_t(b, i, vt_f[j], vy_n[k], vx_w[l]) * vd_b[j] *
+                            vd_s[k] * vd_e[l] +
+                        input_t(b, i, vt_f[j], vy_n[k], vx_e[l]) * vd_b[j] *
+                            vd_s[k] * vd_w[l] +
+                        input_t(b, i, vt_f[j], vy_s[k], vx_w[l]) * vd_b[j] *
+                            vd_n[k] * vd_e[l] +
+                        input_t(b, i, vt_f[j], vy_s[k], vx_e[l]) * vd_b[j] *
+                            vd_n[k] * vd_w[l] +
+                        input_t(b, i, vt_b[j], vy_n[k], vx_w[l]) * vd_f[j] *
+                            vd_s[k] * vd_e[l] +
+                        input_t(b, i, vt_b[j], vy_n[k], vx_e[l]) * vd_f[j] *
+                            vd_s[k] * vd_w[l] +
+                        input_t(b, i, vt_b[j], vy_s[k], vx_w[l]) * vd_f[j] *
+                            vd_n[k] * vd_e[l] +
+                        input_t(b, i, vt_b[j], vy_s[k], vx_e[l]) * vd_f[j] *
+                            vd_n[k] * vd_w[l];
+              output_t(b, i, j, k, l) = out_t;
+            } else {
+              T out_t = input_t(b, vt_f[j], vy_n[k], vx_w[l], i) * vd_b[j] *
+                            vd_s[k] * vd_e[l] +
+                        input_t(b, vt_f[j], vy_n[k], vx_e[l], i) * vd_b[j] *
+                            vd_s[k] * vd_w[l] +
+                        input_t(b, vt_f[j], vy_s[k], vx_w[l], i) * vd_b[j] *
+                            vd_n[k] * vd_e[l] +
+                        input_t(b, vt_f[j], vy_s[k], vx_e[l], i) * vd_b[j] *
+                            vd_n[k] * vd_w[l] +
+                        input_t(b, vt_b[j], vy_n[k], vx_w[l], i) * vd_f[j] *
+                            vd_s[k] * vd_e[l] +
+                        input_t(b, vt_b[j], vy_n[k], vx_e[l], i) * vd_f[j] *
+                            vd_s[k] * vd_w[l] +
+                        input_t(b, vt_b[j], vy_s[k], vx_w[l], i) * vd_f[j] *
+                            vd_n[k] * vd_e[l] +
+                        input_t(b, vt_b[j], vy_s[k], vx_e[l], i) * vd_f[j] *
+                            vd_n[k] * vd_w[l];
+              output_t(b, j, k, l, i) = out_t;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Evaluates the cubic polynomial ((A + 2) * x - (A + 3)) * x^2 + 1.
+// NOTE(review): presumably the inner (|x| <= 1) piece of the cubic
+// convolution kernel with parameter A - confirm.
+template <typename T>
+HOSTDEVICE inline T cubic_convolution1(T x, T A) {
+  const T inner = (A + 2) * x - (A + 3);
+  return inner * x * x + 1;
+}
+
+// Evaluates the cubic polynomial A*x^3 - 5*A*x^2 + 8*A*x - 4*A in nested
+// (Horner) form.
+// NOTE(review): presumably the outer (1 < |x| < 2) piece of the cubic
+// convolution kernel with parameter A - confirm.
+template <typename T>
+HOSTDEVICE inline T cubic_convolution2(T x, T A) {
+  T acc = A * x - 5 * A;
+  acc = acc * x + 8 * A;
+  return acc * x - 4 * A;
+}
+
+// Fills coeffs[0..3] with the four cubic weights for fractional offset t,
+// using A = -0.75.
+//
+// coeffs[0] and coeffs[1] are evaluated at t + 1 and t; coeffs[2] and
+// coeffs[3] are evaluated at the mirrored offsets (1 - t) and (1 - t) + 1.
+template <typename T>
+HOSTDEVICE inline void get_cubic_upsample_coefficients(T coeffs[4], T t) {
+  const T A = -0.75;
+
+  coeffs[0] = cubic_convolution2<T>(t + 1.0, A);
+  coeffs[1] = cubic_convolution1<T>(t, A);
+
+  // Same two polynomials, evaluated at the offset measured from the
+  // opposite side.
+  const T mirrored = 1.0 - t;
+  coeffs[2] = cubic_convolution1<T>(mirrored, A);
+  coeffs[3] = cubic_convolution2<T>(mirrored + 1.0, A);
+}
+
+// Returns the weighted sum of the four samples x0..x3, using the cubic
+// weights that get_cubic_upsample_coefficients produces for offset t.
+template <typename T>
+static inline T cubic_interp(T x0, T x1, T x2, T x3, T t) {
+  T weights[4];
+  get_cubic_upsample_coefficients<T>(weights, t);
+
+  T blended = x0 * weights[0];
+  blended += x1 * weights[1];
+  blended += x2 * weights[2];
+  blended += x3 * weights[3];
+  return blended;
+}
+
+// Forward bicubic interpolation of a rank-4 tensor on the CPU.
+//
+// For each output pixel a 4x4 input neighbourhood, with indices clamped to
+// the valid range, is reduced in two passes: four cubic interpolations
+// along x (one per row), then one along y over those four results.
+// Tensors are indexed as (n, c, h, w) for kNCHW and as (n, h, w, c)
+// otherwise. Source coordinates are ratio * idx with align_corners and
+// ratio * (idx + 0.5) - 0.5 without.
+template <typename T>
+static void BicubicInterpolation(const Tensor& input, Tensor* output,
+                                 const float ratio_h, const float ratio_w,
+                                 const int in_h, const int in_w, const int n,
+                                 const int c, const int out_h, const int out_w,
+                                 const bool align_corners,
+                                 const DataLayout data_layout) {
+  auto src = EigenTensor<T, 4>::From(input);
+  auto dst = EigenTensor<T, 4>::From(*output);
+  const bool channel_first = (data_layout == DataLayout::kNCHW);
+
+  // Clamps `idx` into [0, upper].
+  const auto clamp_index = [](const int idx, const int upper) {
+    return std::max(std::min(idx, upper), static_cast<int>(0));
+  };
+
+  for (int oh = 0; oh < out_h; ++oh) {
+    const T src_y = align_corners
+                        ? static_cast<T>(ratio_h * oh)
+                        : static_cast<T>(ratio_h * (oh + 0.5) - 0.5);
+    const int base_y = floorf(src_y);
+    const T frac_y = src_y - base_y;
+
+    for (int ow = 0; ow < out_w; ++ow) {
+      const T src_x = align_corners
+                          ? static_cast<T>(ratio_w * ow)
+                          : static_cast<T>(ratio_w * (ow + 0.5) - 0.5);
+      const int base_x = floorf(src_x);
+      const T frac_x = src_x - base_x;
+
+      // The four clamped source columns depend only on the output column.
+      const int col0 = clamp_index(base_x - 1, in_w - 1);
+      const int col1 = clamp_index(base_x + 0, in_w - 1);
+      const int col2 = clamp_index(base_x + 1, in_w - 1);
+      const int col3 = clamp_index(base_x + 2, in_w - 1);
+
+      for (int b = 0; b < n; ++b) {
+        for (int ch = 0; ch < c; ++ch) {
+          // First pass: interpolate each of the four rows along x.
+          T row_values[4];
+          for (int tap = 0; tap < 4; ++tap) {
+            const int row = clamp_index(base_y - 1 + tap, in_h - 1);
+            if (channel_first) {
+              row_values[tap] = cubic_interp<T>(
+                  src(b, ch, row, col0), src(b, ch, row, col1),
+                  src(b, ch, row, col2), src(b, ch, row, col3), frac_x);
+            } else {
+              row_values[tap] = cubic_interp<T>(
+                  src(b, row, col0, ch), src(b, row, col1, ch),
+                  src(b, row, col2, ch), src(b, row, col3, ch), frac_x);
+            }
+          }
+
+          // Second pass: interpolate the four row results along y.
+          const T value = cubic_interp<T>(row_values[0], row_values[1],
+                                          row_values[2], row_values[3], frac_y);
+          if (channel_first) {
+            dst(b, ch, oh, ow) = value;
+          } else {
+            dst(b, oh, ow, ch) = value;
+          }
+        }
+      }
+    }
+  }
+}
+
+// Backward pass of the nearest-neighbour resize on the CPU.
+//
+// The gradient of each output pixel (k, l) is accumulated (+=) into the
+// input pixel it was copied from: (int(ratio_h * k), int(ratio_w * l)),
+// with the products rounded (+0.5) before truncation when align_corners is
+// set. Tensors are indexed as (n, c, h, w) for kNCHW and as (n, h, w, c)
+// otherwise.
+template <typename T>
+static void NearestNeighborInterpolateGrad(
+    const Tensor& output_grad, Tensor* input_grad, const float ratio_h,
+    const float ratio_w, const int n, const int c, const int out_h,
+    const int out_w, const bool align_corners, const DataLayout data_layout) {
+  auto din = EigenTensor<T, 4>::From(*input_grad);
+  auto dout = EigenTensor<T, 4>::From(output_grad);
+  const bool channel_first = (data_layout == DataLayout::kNCHW);
+
+  // Maps an output coordinate to the source coordinate it sampled.
+  const auto source_index = [align_corners](const float ratio,
+                                            const int out_idx) {
+    return align_corners ? static_cast<int>(ratio * out_idx + 0.5)
+                         : static_cast<int>(ratio * out_idx);
+  };
+
+  for (int oh = 0; oh < out_h; ++oh) {
+    const int ih = source_index(ratio_h, oh);
+    for (int ow = 0; ow < out_w; ++ow) {
+      const int iw = source_index(ratio_w, ow);
+      for (int b = 0; b < n; ++b) {
+        for (int ch = 0; ch < c; ++ch) {
+          if (channel_first) {
+            din(b, ch, ih, iw) += dout(b, ch, oh, ow);
+          } else {
+            din(b, ih, iw, ch) += dout(b, oh, ow, ch);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Backward pass of bilinear interpolation on the CPU.
+//
+// The gradient of every output pixel is split between the four input
+// pixels around its source position and accumulated (+=) into input_grad.
+// Half-pixel source coordinates are used only when align_mode == 0 and
+// align_corners is false. Tensors are indexed as (n, c, h, w) for kNCHW
+// and as (n, h, w, c) otherwise.
+template <typename T>
+static void BilinearInterpolationGrad(
+    const Tensor& output_grad, Tensor* input_grad, const float ratio_h,
+    const float ratio_w, const int in_h, const int in_w, const int n,
+    const int c, const int out_h, const int out_w, const bool align_corners,
+    const int align_mode, const DataLayout data_layout) {
+  auto din = EigenTensor<T, 4>::From(*input_grad);
+  auto dout = EigenTensor<T, 4>::From(output_grad);
+  const bool half_pixel = (align_mode == 0 && !align_corners);
+  const bool channel_first = (data_layout == DataLayout::kNCHW);
+
+  for (int oh = 0; oh < out_h; ++oh) {
+    int top = half_pixel ? static_cast<int>(ratio_h * (oh + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * oh);
+    top = (top > 0) ? top : 0;
+    const int bottom = (top + 1) < (in_h - 1) ? (top + 1) : (in_h - 1);
+    float src_y = ratio_h * (oh + 0.5) - 0.5;
+    src_y = (src_y > 0) ? src_y : 0;
+    // frac_y weights the bottom row, rem_y the top row.
+    const float frac_y = half_pixel ? src_y - top : ratio_h * oh - top;
+    const float rem_y = 1.f - frac_y;
+
+    for (int ow = 0; ow < out_w; ++ow) {
+      int left = half_pixel ? static_cast<int>(ratio_w * (ow + 0.5) - 0.5)
+                            : static_cast<int>(ratio_w * ow);
+      left = (left > 0) ? left : 0;
+      const int right = (left + 1) < (in_w - 1) ? (left + 1) : (in_w - 1);
+      float src_x = ratio_w * (ow + 0.5) - 0.5;
+      src_x = (src_x > 0) ? src_x : 0;
+      // frac_x weights the right column, rem_x the left column.
+      const float frac_x = half_pixel ? src_x - left : ratio_w * ow - left;
+      const float rem_x = 1.f - frac_x;
+
+      for (int b = 0; b < n; ++b) {
+        for (int ch = 0; ch < c; ++ch) {
+          if (channel_first) {
+            const T g = dout(b, ch, oh, ow);
+            din(b, ch, top, left) += static_cast<T>(g * rem_y * rem_x);
+            din(b, ch, bottom, left) += static_cast<T>(g * frac_y * rem_x);
+            din(b, ch, top, right) += static_cast<T>(g * rem_y * frac_x);
+            din(b, ch, bottom, right) += static_cast<T>(g * frac_y * frac_x);
+          } else {
+            const T g = dout(b, oh, ow, ch);
+            din(b, top, left, ch) += static_cast<T>(g * rem_y * rem_x);
+            din(b, bottom, left, ch) += static_cast<T>(g * frac_y * rem_x);
+            din(b, top, right, ch) += static_cast<T>(g * rem_y * frac_x);
+            din(b, bottom, right, ch) += static_cast<T>(g * frac_y * frac_x);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Backward pass of trilinear interpolation on the CPU.
+//
+// The gradient of every output voxel is split between the eight input
+// voxels around its source position and accumulated (+=) into input_grad.
+// Half-pixel source coordinates are used only when align_mode == 0 and
+// align_corners is false. Tensors are indexed as (n, c, d, h, w) for kNCHW
+// and as (n, d, h, w, c) otherwise.
+template <typename T>
+static void TrilinearInterpolationGrad(
+    const Tensor& output_grad, Tensor* input_grad, const float ratio_d,
+    const float ratio_h, const float ratio_w, const int in_d, const int in_h,
+    const int in_w, const int n, const int c, const int out_d, const int out_h,
+    const int out_w, const bool align_corners, const int align_mode,
+    const DataLayout data_layout) {
+  auto din = EigenTensor<T, 5>::From(*input_grad);
+  auto dout = EigenTensor<T, 5>::From(output_grad);
+  const bool half_pixel = (align_mode == 0 && !align_corners);
+  const bool channel_first = (data_layout == DataLayout::kNCHW);
+
+  for (int od = 0; od < out_d; ++od) {
+    int front = half_pixel ? static_cast<int>(ratio_d * (od + 0.5) - 0.5)
+                           : static_cast<int>(ratio_d * od);
+    front = (front > 0) ? front : 0;
+    const int back = (front + 1) < (in_d - 1) ? (front + 1) : (in_d - 1);
+    float src_d = ratio_d * (od + 0.5) - 0.5;
+    src_d = (src_d > 0) ? src_d : 0;
+    const float frac_d = half_pixel ? src_d - front : ratio_d * od - front;
+    const float rem_d = 1.f - frac_d;
+    // Index / weight pairs: entry 0 is the front slice, entry 1 the back.
+    const int depth_idx[2] = {front, back};
+    const float depth_wt[2] = {rem_d, frac_d};
+
+    for (int oh = 0; oh < out_h; ++oh) {
+      int top = half_pixel ? static_cast<int>(ratio_h * (oh + 0.5) - 0.5)
+                           : static_cast<int>(ratio_h * oh);
+      top = (top > 0) ? top : 0;
+      const int bottom = (top + 1) < (in_h - 1) ? (top + 1) : (in_h - 1);
+      float src_y = ratio_h * (oh + 0.5) - 0.5;
+      src_y = (src_y > 0) ? src_y : 0;
+      const float frac_y = half_pixel ? src_y - top : ratio_h * oh - top;
+      const float rem_y = 1.f - frac_y;
+      const int row_idx[2] = {top, bottom};
+      const float row_wt[2] = {rem_y, frac_y};
+
+      for (int ow = 0; ow < out_w; ++ow) {
+        int left = half_pixel ? static_cast<int>(ratio_w * (ow + 0.5) - 0.5)
+                              : static_cast<int>(ratio_w * ow);
+        left = (left > 0) ? left : 0;
+        const int right = (left + 1) < (in_w - 1) ? (left + 1) : (in_w - 1);
+        float src_x = ratio_w * (ow + 0.5) - 0.5;
+        src_x = (src_x > 0) ? src_x : 0;
+        const float frac_x = half_pixel ? src_x - left : ratio_w * ow - left;
+        const float rem_x = 1.f - frac_x;
+        const int col_idx[2] = {left, right};
+        const float col_wt[2] = {rem_x, frac_x};
+
+        for (int b = 0; b < n; ++b) {
+          for (int ch = 0; ch < c; ++ch) {
+            T g;
+            if (channel_first) {
+              g = dout(b, ch, od, oh, ow);
+            } else {
+              g = dout(b, od, oh, ow, ch);
+            }
+            // Visit the eight corners with depth outermost and width
+            // innermost so accumulation happens in a fixed order.
+            for (int dd = 0; dd < 2; ++dd) {
+              for (int yy = 0; yy < 2; ++yy) {
+                for (int xx = 0; xx < 2; ++xx) {
+                  const T share = static_cast<T>(g * depth_wt[dd] *
+                                                 row_wt[yy] * col_wt[xx]);
+                  if (channel_first) {
+                    din(b, ch, depth_idx[dd], row_idx[yy], col_idx[xx]) +=
+                        share;
+                  } else {
+                    din(b, depth_idx[dd], row_idx[yy], col_idx[xx], ch) +=
+                        share;
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Backward pass of bicubic interpolation on the CPU.
+//
+// The gradient of every output pixel is distributed over the 4x4 input
+// neighbourhood around its source position (indices clamped to the valid
+// range), weighted by the product of the x and y cubic coefficients, and
+// accumulated (+=) into input_grad. Tensors are indexed as (n, c, h, w)
+// for kNCHW and as (n, h, w, c) otherwise.
+template <typename T>
+static void BicubicInterpolationGrad(const Tensor& output_grad,
+                                     Tensor* input_grad, const float ratio_h,
+                                     const float ratio_w, const int in_h,
+                                     const int in_w, const int n, const int c,
+                                     const int out_h, const int out_w,
+                                     const bool align_corners,
+                                     const DataLayout data_layout) {
+  auto din = EigenTensor<T, 4>::From(*input_grad);
+  auto dout = EigenTensor<T, 4>::From(output_grad);
+  const bool channel_first = (data_layout == DataLayout::kNCHW);
+
+  for (int oh = 0; oh < out_h; ++oh) {
+    const T src_y = align_corners
+                        ? static_cast<T>(ratio_h * oh)
+                        : static_cast<T>(ratio_h * (oh + 0.5) - 0.5);
+    const int base_y = floorf(src_y);
+    const T frac_y = src_y - base_y;
+
+    for (int ow = 0; ow < out_w; ++ow) {
+      const T src_x = align_corners
+                          ? static_cast<T>(ratio_w * ow)
+                          : static_cast<T>(ratio_w * (ow + 0.5) - 0.5);
+      const int base_x = floorf(src_x);
+      const T frac_x = src_x - base_x;
+
+      T weights_x[4];
+      T weights_y[4];
+      get_cubic_upsample_coefficients<T>(weights_x, frac_x);
+      get_cubic_upsample_coefficients<T>(weights_y, frac_y);
+
+      for (int b = 0; b < n; ++b) {
+        for (int ch = 0; ch < c; ++ch) {
+          // Columns outermost, rows innermost, so accumulation into
+          // input_grad happens in a fixed order.
+          for (int tx = 0; tx < 4; ++tx) {
+            const int col = std::max(std::min(base_x - 1 + tx, in_w - 1),
+                                     static_cast<int>(0));
+            for (int ty = 0; ty < 4; ++ty) {
+              const int row = std::max(std::min(base_y - 1 + ty, in_h - 1),
+                                       static_cast<int>(0));
+              if (channel_first) {
+                const T g = dout(b, ch, oh, ow);
+                din(b, ch, row, col) += g * weights_y[ty] * weights_x[tx];
+              } else {
+                const T g = dout(b, oh, ow, ch);
+                din(b, row, col, ch) += g * weights_y[ty] * weights_x[tx];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// CPU forward pass of 1-D interpolation (used for 3-D input tensors).
+//
+// Resolves the output width, allocates `output` and, unless the width is
+// unchanged (plain copy of `input`), runs the "linear" method. Width sources,
+// highest priority first: the "SizeTensor" inputs, the "OutSize" input, a
+// positive scale ("Scale" input, else "scale" attribute) times the input
+// width, and finally the "out_w" attribute. Any method other than "linear"
+// leaves the freshly allocated output unwritten.
+template <typename T>
+static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx,
+                                const Tensor& input, Tensor* output) {
+  const DataLayout data_layout =
+      framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  const auto method = ctx.Attr<std::string>("interp_method");
+  const bool align_corners = ctx.Attr<bool>("align_corners");
+  const int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_w = ctx.Attr<int>("out_w");
+  const auto size_tensors = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (size_tensors.size() > 0) {
+    // Explicit size tensors bypass every other size source.
+    const auto resolved = get_new_shape(size_tensors);
+    out_w = resolved[0];
+  } else {
+    float scale_w = -1;
+    const auto* scale_input = ctx.Input<Tensor>("Scale");
+    const auto scale_attr = ctx.Attr<std::vector<float>>("scale");
+    if (scale_input != nullptr) {
+      const auto scale_values = get_new_data_from_tensor<float>(scale_input);
+      scale_w = scale_values[0];
+      PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                               "scale  of Op(interpolate) "
+                                               "should be greater than 0."));
+    } else if (scale_attr.size() > 0) {
+      scale_w = scale_attr[0];
+
+      PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                               "scale  of Op(interpolate) "
+                                               "should be greater than 0."));
+    }
+    if (scale_w > 0.) {
+      out_w = static_cast<int>(in_w * scale_w);
+    }
+    // An "OutSize" input, when present, overrides the scaled width.
+    const auto* out_size_input = ctx.Input<Tensor>("OutSize");
+    if (out_size_input != nullptr) {
+      const auto out_size_values = get_new_data_from_tensor<int>(out_size_input);
+      out_w = out_size_values[0];
+    }
+  }
+  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
+                                  "out_w in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+
+  framework::DDim dim_out;
+  if (data_layout != DataLayout::kNCHW) {
+    dim_out = {n, out_w, c};
+  } else {
+    dim_out = {n, c, out_w};
+  }
+  output->mutable_data<T>(dim_out, ctx.GetPlace());
+
+  // Same width: the result is just a copy of the input.
+  if (in_w == out_w) {
+    framework::TensorCopy(input, ctx.GetPlace(), output);
+    return;
+  }
+
+  // Input step per output element; stays 0 when there is a single output.
+  float ratio_w = 0.f;
+  if (out_w > 1) {
+    if (align_corners) {
+      ratio_w = static_cast<float>(in_w - 1) / (out_w - 1);
+    } else {
+      ratio_w = static_cast<float>(in_w) / out_w;
+    }
+  }
+
+  if (method == "linear") {
+    LinearInterpolation<T>(input, output, ratio_w, in_w, n, c, out_w,
+                           align_corners, align_mode, data_layout);
+  }
+}
+
+// CPU forward pass of 2-D interpolation (used for 4-D input tensors).
+//
+// Resolves the output height/width, allocates `output` and, unless the size
+// is unchanged (plain copy of `input`), dispatches on "interp_method":
+// "bilinear", "nearest" or "bicubic". Size sources, highest priority first:
+// the "SizeTensor" inputs, the "OutSize" input, positive scales ("Scale"
+// input, else "scale" attribute) times the input size, and finally the
+// "out_h"/"out_w" attributes. An unrecognised method leaves the freshly
+// allocated output unwritten.
+template <typename T>
+static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx,
+                                const Tensor& input, Tensor* output) {
+  const DataLayout data_layout =
+      framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  const auto method = ctx.Attr<std::string>("interp_method");
+  const bool align_corners = ctx.Attr<bool>("align_corners");
+  const int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+
+  const auto size_tensors = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (size_tensors.size() > 0) {
+    // Explicit size tensors bypass every other size source.
+    const auto resolved = get_new_shape(size_tensors);
+    out_h = resolved[0];
+    out_w = resolved[1];
+  } else {
+    float scale_h = -1;
+    float scale_w = -1;
+    const auto* scale_input = ctx.Input<Tensor>("Scale");
+    const auto scale_attr = ctx.Attr<std::vector<float>>("scale");
+    if (scale_input != nullptr) {
+      const auto scale_values = get_new_data_from_tensor<float>(scale_input);
+      // One value is shared by both axes; otherwise the order is (h, w).
+      const bool per_axis = scale_values.size() > 1;
+      scale_h = scale_values[0];
+      scale_w = per_axis ? scale_values[1] : scale_values[0];
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    } else if (scale_attr.size() > 1) {
+      scale_h = scale_attr[0];
+      scale_w = scale_attr[1];
+
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    }
+    if (scale_h > 0. && scale_w > 0.) {
+      out_h = static_cast<int>(in_h * scale_h);
+      out_w = static_cast<int>(in_w * scale_w);
+    }
+    // An "OutSize" input, when present, overrides the scaled size.
+    const auto* out_size_input = ctx.Input<Tensor>("OutSize");
+    if (out_size_input != nullptr) {
+      const auto out_size_values = get_new_data_from_tensor<int>(out_size_input);
+      out_h = out_size_values[0];
+      out_w = out_size_values[1];
+    }
+  }
+  PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
+                                  "out_h in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
+                                  "out_w in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+
+  framework::DDim dim_out;
+  if (data_layout != DataLayout::kNCHW) {
+    dim_out = {n, out_h, out_w, c};
+  } else {
+    dim_out = {n, c, out_h, out_w};
+  }
+  output->mutable_data<T>(dim_out, ctx.GetPlace());
+
+  // Same spatial size: the result is just a copy of the input.
+  const bool same_size = (in_h == out_h) && (in_w == out_w);
+  if (same_size) {
+    framework::TensorCopy(input, ctx.GetPlace(), output);
+    return;
+  }
+
+  // Input step per output element on each axis; an axis with a single output
+  // element keeps a ratio of 0.
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_h > 1) {
+    if (align_corners) {
+      ratio_h = static_cast<float>(in_h - 1) / (out_h - 1);
+    } else {
+      ratio_h = static_cast<float>(in_h) / out_h;
+    }
+  }
+  if (out_w > 1) {
+    if (align_corners) {
+      ratio_w = static_cast<float>(in_w - 1) / (out_w - 1);
+    } else {
+      ratio_w = static_cast<float>(in_w) / out_w;
+    }
+  }
+
+  if (method == "bilinear") {
+    BilinearInterpolation<T>(input, output, ratio_h, ratio_w, in_h, in_w, n, c,
+                             out_h, out_w, align_corners, align_mode,
+                             data_layout);
+  } else if (method == "nearest") {
+    NearestNeighborInterpolate<T>(input, output, ratio_h, ratio_w, n, c, out_h,
+                                  out_w, align_corners, data_layout);
+  } else if (method == "bicubic") {
+    BicubicInterpolation<T>(input, output, ratio_h, ratio_w, in_h, in_w, n, c,
+                            out_h, out_w, align_corners, data_layout);
+  }
+}
+
+// CPU forward pass of 3-D interpolation (used for 5-D input tensors).
+//
+// Resolves the output depth/height/width, allocates `output` and, unless the
+// size is unchanged (plain copy of `input`), runs the "trilinear" method.
+// Size sources, highest priority first: the "SizeTensor" inputs, the
+// "OutSize" input, positive scales ("Scale" input, else "scale" attribute)
+// times the input size, and finally the "out_d"/"out_h"/"out_w" attributes.
+// Any method other than "trilinear" leaves the freshly allocated output
+// unwritten.
+//
+// Raises InvalidArgument when a supplied scale is not strictly positive on
+// every axis, or when a resolved output extent is not greater than 0.
+template <typename T>
+static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx,
+                                const Tensor& input, Tensor* output) {
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_d = ctx.Attr<int>("out_d");
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_d = new_size[0];
+    out_h = new_size[1];
+    out_w = new_size[2];
+  } else {
+    float scale_d = -1;
+    float scale_h = -1;
+    float scale_w = -1;
+    auto scale_tensor = ctx.Input<Tensor>("Scale");
+    auto scale = ctx.Attr<std::vector<float>>("scale");
+    if (scale_tensor != nullptr) {
+      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+      // One value is shared by all axes; otherwise the order is (d, h, w).
+      // NOTE(review): a 2-element scale tensor would read index 2 here --
+      // confirm callers only ever pass 1 or 3 values.
+      if (scale_data.size() > 1) {
+        scale_d = scale_data[0];
+        scale_h = scale_data[1];
+        scale_w = scale_data[2];
+      } else {
+        scale_d = scale_data[0];
+        scale_h = scale_data[0];
+        scale_w = scale_data[0];
+      }
+      // Every axis must be strictly positive. Testing `scale_d` only for
+      // truthiness would accept a negative depth scale, which the `> 0.`
+      // guard below would then silently drop.
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    } else {
+      // NOTE(review): a 2-element "scale" attribute would read index 2 here
+      // -- confirm the attribute always carries 3 values for 3-D inputs.
+      if (scale.size() > 1) {
+        scale_d = scale[0];
+        scale_h = scale[1];
+        scale_w = scale[2];
+
+        PADDLE_ENFORCE_EQ(
+            scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+            platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                              "should be greater than 0."));
+      }
+    }
+    if (scale_w > 0. && scale_h > 0. && scale_d > 0.) {
+      out_d = static_cast<int>(in_d * scale_d);
+      out_h = static_cast<int>(in_h * scale_h);
+      out_w = static_cast<int>(in_w * scale_w);
+    }
+    // An "OutSize" input, when present, overrides the scaled size.
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      auto out_size_data = get_new_data_from_tensor<int>(out_size);
+      out_d = out_size_data[0];
+      out_h = out_size_data[1];
+      out_w = out_size_data[2];
+    }
+  }
+  PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument(
+                                  "out_d in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
+                                  "out_h in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
+                                  "out_w in Attr(out_shape) of Op(interpolate) "
+                                  "should be greater than 0."));
+
+  framework::DDim dim_out;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_out = {n, c, out_d, out_h, out_w};
+  } else {
+    dim_out = {n, out_d, out_h, out_w, c};
+  }
+
+  output->mutable_data<T>(dim_out, ctx.GetPlace());
+
+  // Same spatial size: the result is just a copy of the input.
+  if (in_d == out_d && in_h == out_h && in_w == out_w) {
+    framework::TensorCopy(input, ctx.GetPlace(), output);
+    return;
+  }
+
+  // Input step per output element on each axis; an axis with a single output
+  // element keeps a ratio of 0.
+  float ratio_d = 0.f;
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_d > 1) {
+    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
+                              : static_cast<float>(in_d) / out_d;
+  }
+  if (out_h > 1) {
+    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                              : static_cast<float>(in_h) / out_h;
+  }
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  if ("trilinear" == interp_method) {
+    TrilinearInterpolation<T>(input, output, ratio_d, ratio_h, ratio_w, in_d,
+                              in_h, in_w, n, c, out_d, out_h, out_w,
+                              align_corners, align_mode, data_layout);
+  }
+}
+
+// CPU backward pass of 1-D interpolation (used for 3-D gradient tensors).
+//
+// Re-derives the forward output width from the op's attributes and inputs
+// (each later source overwrites the earlier one: "out_w" attribute, positive
+// scale, "OutSize" input, "SizeTensor" inputs), allocates `input_grad` with
+// the shape of input "X" and zero-fills it. When the width is unchanged the
+// output gradient is copied through; otherwise only the "linear" method
+// accumulates gradients, any other method leaves `input_grad` all zeros.
+template <typename T>
+static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx,
+                                Tensor* input_grad, const Tensor& output_grad) {
+  const auto* x = ctx.Input<Tensor>("X");
+  const DataLayout data_layout =
+      framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(x->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  const auto method = ctx.Attr<std::string>("interp_method");
+  const bool align_corners = ctx.Attr<bool>("align_corners");
+  const int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_w = ctx.Attr<int>("out_w");
+  float scale_w = -1.0;
+  const auto* scale_input = ctx.Input<Tensor>("Scale");
+  const auto scale_attr = ctx.Attr<std::vector<float>>("scale");
+  if (scale_input != nullptr) {
+    const auto scale_values = get_new_data_from_tensor<float>(scale_input);
+    scale_w = scale_values[0];
+    PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                             "scale  of Op(interpolate) "
+                                             "should be greater than 0."));
+  } else if (scale_attr.size() > 0) {
+    scale_w = scale_attr[0];
+    PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument(
+                                             "scale  of Op(interpolate) "
+                                             "should be greater than 0."));
+  }
+  if (scale_w > 0.) {
+    out_w = static_cast<int>(in_w * scale_w);
+  }
+
+  const auto* out_size_input = ctx.Input<Tensor>("OutSize");
+  if (out_size_input != nullptr) {
+    const auto out_size_values = get_new_data_from_tensor<int>(out_size_input);
+    out_w = out_size_values[0];
+  }
+
+  const auto size_tensors = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (size_tensors.size() > 0) {
+    const auto resolved = get_new_shape(size_tensors);
+    out_w = resolved[0];
+  }
+
+  framework::DDim dim_grad;
+  if (data_layout != DataLayout::kNCHW) {
+    dim_grad = {n, in_w, c};
+  } else {
+    dim_grad = {n, c, in_w};
+  }
+  input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+
+  // Gradients are accumulated with +=, so start from an all-zero buffer.
+  auto& device_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+  math::SetConstant<platform::CPUDeviceContext, T> fill_zero;
+  fill_zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+  // Same width: the gradient passes through unchanged.
+  if (in_w == out_w) {
+    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
+    return;
+  }
+
+  // Input step per output element; stays 0 when there is a single output.
+  float ratio_w = 0.f;
+  if (out_w > 1) {
+    if (align_corners) {
+      ratio_w = static_cast<float>(in_w - 1) / (out_w - 1);
+    } else {
+      ratio_w = static_cast<float>(in_w) / out_w;
+    }
+  }
+
+  if (method == "linear") {
+    LinearInterpolationGrad<T>(output_grad, input_grad, ratio_w, in_w, n, c,
+                               out_w, align_corners, align_mode, data_layout);
+  }
+}
+
+// CPU backward pass of 2-D interpolation (used for 4-D gradient tensors).
+//
+// Re-derives the forward output height/width from the op's attributes and
+// inputs (each later source overwrites the earlier one: "out_h"/"out_w"
+// attributes, positive scales, "OutSize" input, "SizeTensor" inputs),
+// allocates `input_grad` with the shape of input "X" and zero-fills it. When
+// the size is unchanged the output gradient is copied through; otherwise the
+// gradient routine matching "interp_method" ("bilinear", "nearest" or
+// "bicubic") is run, and an unrecognised method leaves `input_grad` all
+// zeros.
+template <typename T>
+static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx,
+                                Tensor* input_grad, const Tensor& output_grad) {
+  const auto* x = ctx.Input<Tensor>("X");
+  const DataLayout data_layout =
+      framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(x->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  const auto method = ctx.Attr<std::string>("interp_method");
+  const bool align_corners = ctx.Attr<bool>("align_corners");
+  const int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+  float scale_h = -1;
+  float scale_w = -1;
+  const auto* scale_input = ctx.Input<Tensor>("Scale");
+  const auto scale_attr = ctx.Attr<std::vector<float>>("scale");
+  if (scale_input != nullptr) {
+    const auto scale_values = get_new_data_from_tensor<float>(scale_input);
+    // One value is shared by both axes; otherwise the order is (h, w).
+    const bool per_axis = scale_values.size() > 1;
+    scale_h = scale_values[0];
+    scale_w = per_axis ? scale_values[1] : scale_values[0];
+    PADDLE_ENFORCE_EQ(
+        scale_w > 0 && scale_h > 0, true,
+        platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                          "should be greater than 0."));
+  } else if (scale_attr.size() > 1) {
+    scale_h = scale_attr[0];
+    scale_w = scale_attr[1];
+    PADDLE_ENFORCE_EQ(
+        scale_w > 0 && scale_h > 0, true,
+        platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                          "should be greater than 0."));
+  }
+  if (scale_h > 0. && scale_w > 0.) {
+    out_h = static_cast<int>(in_h * scale_h);
+    out_w = static_cast<int>(in_w * scale_w);
+  }
+
+  const auto* out_size_input = ctx.Input<Tensor>("OutSize");
+  if (out_size_input != nullptr) {
+    const auto out_size_values = get_new_data_from_tensor<int>(out_size_input);
+    out_h = out_size_values[0];
+    out_w = out_size_values[1];
+  }
+
+  const auto size_tensors = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (size_tensors.size() > 0) {
+    const auto resolved = get_new_shape(size_tensors);
+    out_h = resolved[0];
+    out_w = resolved[1];
+  }
+
+  framework::DDim dim_grad;
+  if (data_layout != DataLayout::kNCHW) {
+    dim_grad = {n, in_h, in_w, c};
+  } else {
+    dim_grad = {n, c, in_h, in_w};
+  }
+  input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+
+  // Gradients are accumulated with +=, so start from an all-zero buffer.
+  auto& device_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+  math::SetConstant<platform::CPUDeviceContext, T> fill_zero;
+  fill_zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+  // Same spatial size: the gradient passes through unchanged.
+  const bool same_size = (in_h == out_h) && (in_w == out_w);
+  if (same_size) {
+    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
+    return;
+  }
+
+  // Input step per output element on each axis; an axis with a single output
+  // element keeps a ratio of 0.
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_h > 1) {
+    if (align_corners) {
+      ratio_h = static_cast<float>(in_h - 1) / (out_h - 1);
+    } else {
+      ratio_h = static_cast<float>(in_h) / out_h;
+    }
+  }
+  if (out_w > 1) {
+    if (align_corners) {
+      ratio_w = static_cast<float>(in_w - 1) / (out_w - 1);
+    } else {
+      ratio_w = static_cast<float>(in_w) / out_w;
+    }
+  }
+
+  if (method == "bilinear") {
+    BilinearInterpolationGrad<T>(output_grad, input_grad, ratio_h, ratio_w,
+                                 in_h, in_w, n, c, out_h, out_w, align_corners,
+                                 align_mode, data_layout);
+  } else if (method == "nearest") {
+    NearestNeighborInterpolateGrad<T>(output_grad, input_grad, ratio_h, ratio_w,
+                                      n, c, out_h, out_w, align_corners,
+                                      data_layout);
+  } else if (method == "bicubic") {
+    BicubicInterpolationGrad<T>(output_grad, input_grad, ratio_h, ratio_w, in_h,
+                                in_w, n, c, out_h, out_w, align_corners,
+                                data_layout);
+  }
+}
+
+// CPU backward pass of 3-D interpolation (used for 5-D gradient tensors).
+//
+// Re-derives the forward output depth/height/width from the op's attributes
+// and inputs (each later source overwrites the earlier one: "out_d"/"out_h"/
+// "out_w" attributes, positive scales, "OutSize" input, "SizeTensor"
+// inputs), allocates `input_grad` with the shape of input "X" and zero-fills
+// it. When the size is unchanged the output gradient is copied through;
+// otherwise only the "trilinear" method accumulates gradients, any other
+// method leaves `input_grad` all zeros.
+//
+// `output_grad` is taken by const reference, like the 1-D and 2-D backward
+// passes, so the tensor object is not copied on every call.
+//
+// Raises InvalidArgument when a supplied scale is not strictly positive on
+// every axis.
+template <typename T>
+static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx,
+                                Tensor* input_grad, const Tensor& output_grad) {
+  auto* input = ctx.Input<Tensor>("X");
+  const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+  const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
+  int n, c, in_d, in_h, in_w;
+  ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
+
+  auto interp_method = ctx.Attr<std::string>("interp_method");
+  bool align_corners = ctx.Attr<bool>("align_corners");
+  int align_mode = ctx.Attr<int>("align_mode");
+
+  int out_d = ctx.Attr<int>("out_d");
+  int out_h = ctx.Attr<int>("out_h");
+  int out_w = ctx.Attr<int>("out_w");
+  float scale_d = -1;
+  float scale_h = -1;
+  float scale_w = -1;
+  auto scale_tensor = ctx.Input<Tensor>("Scale");
+  auto scale = ctx.Attr<std::vector<float>>("scale");
+  if (scale_tensor != nullptr) {
+    auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+    // One value is shared by all axes; otherwise the order is (d, h, w).
+    // NOTE(review): a 2-element scale tensor would read index 2 here --
+    // confirm callers only ever pass 1 or 3 values.
+    if (scale_data.size() > 1) {
+      scale_d = scale_data[0];
+      scale_h = scale_data[1];
+      scale_w = scale_data[2];
+    } else {
+      scale_d = scale_data[0];
+      scale_h = scale_data[0];
+      scale_w = scale_data[0];
+    }
+    PADDLE_ENFORCE_EQ(
+        scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+        platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                          "should be greater than 0."));
+  } else {
+    // NOTE(review): a 2-element "scale" attribute would read index 2 here --
+    // confirm the attribute always carries 3 values for 3-D inputs.
+    if (scale.size() > 1) {
+      scale_d = scale[0];
+      scale_h = scale[1];
+      scale_w = scale[2];
+      PADDLE_ENFORCE_EQ(
+          scale_w > 0 && scale_h > 0 && scale_d > 0, true,
+          platform::errors::InvalidArgument("scale  of Op(interpolate) "
+                                            "should be greater than 0."));
+    }
+  }
+  if (scale_d > 0. && scale_h > 0. && scale_w > 0.) {
+    out_d = static_cast<int>(in_d * scale_d);
+    out_h = static_cast<int>(in_h * scale_h);
+    out_w = static_cast<int>(in_w * scale_w);
+  }
+  // An "OutSize" input, when present, overrides the scaled size.
+  auto out_size = ctx.Input<Tensor>("OutSize");
+  if (out_size != nullptr) {
+    auto out_size_data = get_new_data_from_tensor<int>(out_size);
+    out_d = out_size_data[0];
+    out_h = out_size_data[1];
+    out_w = out_size_data[2];
+  }
+  // "SizeTensor" inputs, when present, override everything above.
+  auto list_new_size_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
+  if (list_new_size_tensor.size() > 0) {
+    // have size tensor
+    auto new_size = get_new_shape(list_new_size_tensor);
+    out_d = new_size[0];
+    out_h = new_size[1];
+    out_w = new_size[2];
+  }
+
+  framework::DDim dim_grad;
+  if (data_layout == DataLayout::kNCHW) {
+    dim_grad = {n, c, in_d, in_h, in_w};
+  } else {
+    dim_grad = {n, in_d, in_h, in_w, c};
+  }
+  input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
+  // Zero-fill the gradient buffer before any gradient routine writes to it.
+  auto& device_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+  math::SetConstant<platform::CPUDeviceContext, T> zero;
+  zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+  // Same spatial size: the gradient passes through unchanged.
+  if (in_d == out_d && in_h == out_h && in_w == out_w) {
+    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
+    return;
+  }
+
+  // Input step per output element on each axis; an axis with a single output
+  // element keeps a ratio of 0.
+  float ratio_d = 0.f;
+  float ratio_h = 0.f;
+  float ratio_w = 0.f;
+  if (out_d > 1) {
+    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
+                              : static_cast<float>(in_d) / out_d;
+  }
+  if (out_h > 1) {
+    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                              : static_cast<float>(in_h) / out_h;
+  }
+  if (out_w > 1) {
+    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                              : static_cast<float>(in_w) / out_w;
+  }
+
+  if ("trilinear" == interp_method) {
+    TrilinearInterpolationGrad<T>(
+        output_grad, input_grad, ratio_d, ratio_h, ratio_w, in_d, in_h, in_w, n,
+        c, out_d, out_h, out_w, align_corners, align_mode, data_layout);
+  }
+}
+
+// Forward CPU kernel: picks the 1-D, 2-D or 3-D interpolation routine from
+// the rank of input "X".
+template <typename T>
+class InterpolateV2Kernel : public framework::OpKernel<T> {
+ public:
+  // Reads input "X" and output "Out" and forwards them to the routine that
+  // matches the rank of X (3, 4 or 5). Any other rank matches no case, so
+  // nothing is computed.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* x = ctx.Input<Tensor>("X");
+    auto* out = ctx.Output<Tensor>("Out");
+
+    switch (x->dims().size()) {
+      case 3:  // 1D interpolation
+        Interpolate1DCPUFwd<T>(ctx, *x, out);
+        break;
+      case 4:  // 2D interpolation
+        Interpolate2DCPUFwd<T>(ctx, *x, out);
+        break;
+      case 5:  // 3D interpolation
+        Interpolate3DCPUFwd<T>(ctx, *x, out);
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+// Backward CPU kernel: picks the 1-D, 2-D or 3-D gradient routine from the
+// rank of the gradient of "Out".
+template <typename T>
+class InterpolateV2GradKernel : public framework::OpKernel<T> {
+ public:
+  // Reads the gradient of "Out" and the output slot for the gradient of "X"
+  // and forwards them to the routine that matches the rank of the incoming
+  // gradient (3, 4 or 5). Any other rank matches no case, so nothing is
+  // computed.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    const auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    switch (dout->dims().size()) {
+      case 3:  // 1D interpolation grad
+        Interpolate1DCPUBwd<T>(ctx, dx, *dout);
+        break;
+      case 4:  // 2D interpolation grad
+        Interpolate2DCPUBwd<T>(ctx, dx, *dout);
+        break;
+      case 5:  // 3D interpolation grad
+        Interpolate3DCPUBwd<T>(ctx, dx, *dout);
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc
index 86feaa72d5fa69cd5d76e56182c27b8d048e4c74..a4bdc923eecc3e1283e642e592f91d658e7c9aa7 100644
--- a/paddle/fluid/operators/math/sampler.cc
+++ b/paddle/fluid/operators/math/sampler.cc
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/sampler.h"
+
 #include <glog/logging.h>
+
 #include <iostream>
 #include <queue>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/generator.h"
 
 namespace paddle {
@@ -28,22 +31,17 @@ Sampler::~Sampler() {}
 
 UniformSampler::UniformSampler(int64_t range, unsigned int seed)
     : Sampler(range, seed), inv_range_(1.0 / (range + 1)) {
-  random_engine_ = std::make_shared<std::mt19937_64>(seed_);
+  random_engine_ = framework::GetCPURandomEngine(seed_);
   dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range);
 }
 
-int64_t UniformSampler::Sample() const {
-  return framework::Generator::GetInstance()->is_init_py
-             ? (*dist_)(framework::Generator::GetInstance()->GetCPUEngine())
-             : (*dist_)(*random_engine_);
-  // return (*dist_)(*random_engine_);
-}
+int64_t UniformSampler::Sample() const { return (*dist_)(*random_engine_); }  // one draw from dist_ using this sampler's random_engine_
 
 float UniformSampler::Probability(int64_t value) const { return inv_range_; }
 
 LogUniformSampler::LogUniformSampler(int64_t range, unsigned int seed)
     : Sampler(range, seed), log_range_(log(range + 1)) {
-  random_engine_ = std::make_shared<std::mt19937_64>(seed_);
+  random_engine_ = framework::GetCPURandomEngine(seed_);
   dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
 }
 
@@ -52,10 +50,7 @@ int64_t LogUniformSampler::Sample() const {
   // inverse_transform_sampling method
   // More details:
   // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/
-  auto cur_random =
-      framework::Generator::GetInstance()->is_init_py
-          ? (*dist_)(framework::Generator::GetInstance()->GetCPUEngine())
-          : (*dist_)(*random_engine_);
+  auto cur_random = (*dist_)(*random_engine_);
   const int64_t value = static_cast<int64_t>(exp(cur_random * log_range_)) - 1;
   // Mathematically, value should be <= range_, but might not be due to some
   // floating point roundoff, so we mod by range_.
@@ -74,7 +69,7 @@ CustomSampler::CustomSampler(int64_t range, const float *probabilities,
                              const int *alias, const float *alias_probabilities,
                              unsigned int seed)
     : Sampler(range, seed) {
-  random_engine_ = std::make_shared<std::mt19937>(seed_);
+  random_engine_ = framework::GetCPURandomEngine(seed_);
   real_dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
   int_dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range);
 
@@ -84,14 +79,8 @@ CustomSampler::CustomSampler(int64_t range, const float *probabilities,
 }
 
 int64_t CustomSampler::Sample() const {
-  auto index =
-      framework::Generator::GetInstance()->is_init_py
-          ? (*int_dist_)(framework::Generator::GetInstance()->GetCPUEngine())
-          : (*int_dist_)(*random_engine_);
-  auto p =
-      framework::Generator::GetInstance()->is_init_py
-          ? (*real_dist_)(framework::Generator::GetInstance()->GetCPUEngine())
-          : (*real_dist_)(*random_engine_);
+  auto index = (*int_dist_)(*random_engine_);
+  auto p = (*real_dist_)(*random_engine_);
   if (p > alias_probs_[index]) {
     int alias = alias_[index];
 
diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h
index 3fa5a7ae336a9be984324411b88570aea99c2c78..480576ef9dc8c21811a1a867d553ccc6d97fa22a 100644
--- a/paddle/fluid/operators/math/sampler.h
+++ b/paddle/fluid/operators/math/sampler.h
@@ -26,8 +26,8 @@ namespace math {
 // TODO(wanghaoshuang): Support for GPU
 
 /**
-* Sample integers from [0, range).
-*/
+ * Sample integers from [0, range).
+ */
 class Sampler {
  public:
   explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
@@ -117,7 +117,7 @@ class CustomSampler : public Sampler {
   const int* alias_;
   const float* probs_;
   const int exceptional_val = -1;
-  std::shared_ptr<std::mt19937> random_engine_;
+  std::shared_ptr<std::mt19937_64> random_engine_;
   std::shared_ptr<std::uniform_real_distribution<>> real_dist_;
   std::shared_ptr<std::uniform_int_distribution<>> int_dist_;
 };
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index a50cc22e5bb0def54b057dcc23d2f6751eecc478..40737f4cd029b47dbd03069a2e4d29ad33121eb9 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -44,6 +44,7 @@ class FCPrimitiveFactory {
 
   void ExecuteFcPrimitive(const LoDTensor* input, const Tensor* weights,
                           const Tensor* bias, LoDTensor* output,
+                          const MKLDNNDeviceContext& dev_ctx,
                           const ExecutionContext& ctx) {
     RecomputeOutputDims(ctx, input, weights, output);
     // If primitive has already been created and cached, don't create new one,
@@ -74,8 +75,8 @@ class FCPrimitiveFactory {
               "input format is equal to ncw."));
     }
 
-    // Transform weights to default MKL-DNN format
-    weights_ = TransposeWeights(weights);
+    weights_ = CreateWeightsMemory(weights);
+
     // Since MKL-DNN has a lot of limitations on what the input/weights/output
     // dimensions should be, to simplify the code, the creation of primitive
     // descriptor has been divided into separate cases, based on the number
@@ -112,10 +113,13 @@ class FCPrimitiveFactory {
     // Quantize weights and reorder to format chosen by FC primitive descriptor.
     QuantizeWeights(ctx, fc_prim_desc->weights_desc());
 
-    bias_ = CreateMemory<float>(fc_prim_desc->bias_desc(), bias);
+    bias_ = CreateMemoryToBeCached<float>(fc_prim_desc->bias_desc(), bias);
     // If int8 is desired, quantize bias into 32-bit signed int
     QuantizeBias(*fc_prim_desc, ctx);
 
+    // Store weights and bias in the mkldnn cache
+    CacheWeightsAndBias(dev_ctx, ctx);
+
     // Based on format determined by inner_product, create output in desired
     // memory format
     output_ = CreateDstMemory(*fc_prim_desc, ctx, output);
@@ -262,14 +266,15 @@ class FCPrimitiveFactory {
   }
 
   // Convert data from one data format to another
-  mkldnn::memory Reorder(const memory::desc& src_desc,
-                         const memory::desc& dst_desc, void* src_data) {
+  std::shared_ptr<mkldnn::memory> Reorder(const memory::desc& src_desc,
+                                          const memory::desc& dst_desc,
+                                          void* src_data) {
     auto src_mem = memory(src_desc, engine_, src_data);
-    auto dst_mem = memory(dst_desc, engine_);
+    auto dst_mem = std::make_shared<memory>(dst_desc, engine_);
 
-    auto reorder = mkldnn::reorder(src_mem, dst_mem);
+    auto reorder = mkldnn::reorder(src_mem, *dst_mem);
     mkldnn::stream astream(engine_);
-    reorder.execute(astream, src_mem, dst_mem);
+    reorder.execute(astream, src_mem, *dst_mem);
     astream.wait();
 
     return dst_mem;
@@ -277,9 +282,10 @@ class FCPrimitiveFactory {
 
   // Convert data from one data format to another and rescale it.
   // If the desired data type is (un)signed int8, quantization occurs here.
-  mkldnn::memory Reorder(const memory& src_mem, const memory::desc& dst_md,
-                         const std::vector<float>& scale_data) {
-    mkldnn::memory dst_mem = mkldnn::memory(dst_md, engine_);
+  std::shared_ptr<mkldnn::memory> ReorderWithScale(
+      const std::shared_ptr<memory> src_mem, const memory::desc& dst_md,
+      const std::vector<float>& scale_data) {
+    auto dst_mem = std::make_shared<mkldnn::memory>(dst_md, engine_);
     mkldnn::primitive_attr attributes;
     // According to MKL-DNN's documentation mask determines along which
     // dimensions should the scale be applied.
@@ -289,11 +295,11 @@ class FCPrimitiveFactory {
     //     becuase we perform per-output-channel quantization
     int mask = CreateMask(0, scale_data.size() > 1);
     attributes.set_output_scales(mask, scale_data);
-    auto reorder = mkldnn::reorder(src_mem, dst_mem, attributes);
+    auto reorder = mkldnn::reorder(*src_mem, *dst_mem, attributes);
 
     mkldnn::stream astream(engine_);
     reorder.execute(astream,
-                    {{MKLDNN_ARG_FROM, src_mem}, {MKLDNN_ARG_TO, dst_mem}});
+                    {{MKLDNN_ARG_FROM, *src_mem}, {MKLDNN_ARG_TO, *dst_mem}});
     astream.wait();
 
     return dst_mem;
@@ -323,16 +329,38 @@ class FCPrimitiveFactory {
     return memory(desc, engine_, data);
   }
 
-  // Transpose weights through MKL-DNN's reorder from io to oi format.
-  mkldnn::memory TransposeWeights(const Tensor* weights) {
+  // Wraps a raw data pointer in a shared mkldnn::memory for desc on engine_.
+  std::shared_ptr<mkldnn::memory> CreateMemoryToBeCached(
+      const mkldnn::memory::desc& desc, void* data) {
+    return std::make_shared<memory>(desc, engine_, data);
+  }
+  template <typename T>
+  std::shared_ptr<mkldnn::memory> CreateMemoryToBeCached(
+      const mkldnn::memory::desc& desc, const Tensor* tensor) {
+    void* raw = platform::to_void_cast<T>(tensor->data<T>());
+    return CreateMemoryToBeCached(desc, raw);
+  }
+
+  // Create weights memory and transform to default MKL-DNN format
+  std::shared_ptr<mkldnn::memory> CreateWeightsMemory(const Tensor* weights) {
     auto dims = framework::vectorize(weights->dims());
     std::swap(dims[0], dims[1]);  // Correct output dimensions
     auto src_desc = CreateMemDescriptor<float>(dims, MKLDNNMemoryFormat::io);
     auto dst_desc = CreateMemDescriptor<float>(dims, MKLDNNMemoryFormat::oi);
+    // Transpose weights through MKL-DNN's reorder from io to oi format.
     return Reorder(src_desc, dst_desc,
                    platform::to_void_cast<float>(weights->data<float>()));
   }
 
+  // Stores weights_ and bias_ as device-context blobs. Each key is the
+  // thread-id key followed by the name of the "W" / "Bias" input.
+  void CacheWeightsAndBias(const MKLDNNDeviceContext& dev_ctx,
+                           const ExecutionContext& ctx) {
+    const std::string prefix = platform::CreateKey(platform::ThreadIDasStr());
+    dev_ctx.SetBlob(prefix + ctx.InputName("W"), weights_);
+    dev_ctx.SetBlob(prefix + ctx.InputName("Bias"), bias_);
+  }
+
   // Compute the bias scales so that its values correspond to the
   // scale of data being an output of weights and input multiplication
   std::vector<float> ComputeBiasScales(const ExecutionContext& ctx) {
@@ -388,14 +416,14 @@ class FCPrimitiveFactory {
   }
 
   void QuantizeWeights(const ExecutionContext& ctx, memory::desc dst) {
-    weights_ =
-        Reorder(*weights_, dst, ctx.Attr<std::vector<float>>("Scale_weights"));
+    weights_ = ReorderWithScale(weights_, dst,
+                                ctx.Attr<std::vector<float>>("Scale_weights"));
   }
 
   void QuantizeBias(const inner_product_forward::primitive_desc& fc_prim_desc,
                     const ExecutionContext& ctx) {
     auto bias_scales = ComputeBiasScales(ctx);
-    bias_ = Reorder(*bias_, fc_prim_desc.bias_desc(), bias_scales);
+    bias_ = ReorderWithScale(bias_, fc_prim_desc.bias_desc(), bias_scales);
   }
 
   // Fuse relu into FC with activation type attribute has been set to 'relu'
@@ -463,10 +491,10 @@ class FCPrimitiveFactory {
 
  private:
   const mkldnn::engine& engine_;
-  boost::optional<memory> bias_;
   boost::optional<memory> input_;
   boost::optional<memory> output_;
-  boost::optional<memory> weights_;
+  std::shared_ptr<memory> bias_;
+  std::shared_ptr<memory> weights_;
   boost::optional<inner_product_forward> fc_;
 };
 
@@ -476,19 +504,13 @@ class FCPrimitiveFactory {
 template <typename T_in, typename T_w, typename T_out>
 static std::shared_ptr<FCPrimitiveFactory<T_in, T_w, T_out>>
 GetPrimitiveFactory(const MKLDNNDeviceContext& dev_ctx,
-                    const ExecutionContext& ctx, const Tensor* input,
-                    const Tensor* weights,
-                    const mkldnn::engine& mkldnn_engine) {
-  const std::string key = platform::CreateKey(
-      platform::ThreadIDasStr(), input->format(), input->dims()[0],
-      framework::vectorize<int>(weights->dims()), ctx.OutputName("Out"));
-
+                    const std::string& key) {
   auto prim_creator =
       std::static_pointer_cast<FCPrimitiveFactory<T_in, T_w, T_out>>(
           dev_ctx.GetBlob(key));
   if (prim_creator == nullptr) {
-    prim_creator =
-        std::make_shared<FCPrimitiveFactory<T_in, T_w, T_out>>(mkldnn_engine);
+    prim_creator = std::make_shared<FCPrimitiveFactory<T_in, T_w, T_out>>(
+        dev_ctx.GetEngine());
     dev_ctx.SetBlob(key, prim_creator);
   }
 
@@ -498,24 +520,24 @@ GetPrimitiveFactory(const MKLDNNDeviceContext& dev_ctx,
 // Choose appropriate primitive factory implementation based on inferred
 // output type (uint8, int8 or float).
 template <typename T_in, typename T_w>
-static void ExecuteFc(const MKLDNNDeviceContext& dev_ctx,
-                      const ExecutionContext& ctx, const LoDTensor* input,
+static void ExecuteFc(const ExecutionContext& ctx, const LoDTensor* input,
                       const Tensor* w, const Tensor* bias, LoDTensor* output,
-                      const mkldnn::engine& mkldnn_engine, bool fuse_relu,
-                      bool force_fp32_output) {
+                      bool fuse_relu, bool force_fp32_output) {
+  auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+  const std::string prim_key = platform::CreateKey(
+      platform::ThreadIDasStr(), input->format(), input->dims()[0],
+      framework::vectorize<int>(w->dims()), ctx.OutputName("Out"));
   constexpr bool is_int8 =
       std::is_same<T_in, int8_t>::value || std::is_same<T_in, uint8_t>::value;
   if (!is_int8 || force_fp32_output) {
-    GetPrimitiveFactory<T_in, T_w, float>(dev_ctx, ctx, input, w, mkldnn_engine)
-        ->ExecuteFcPrimitive(input, w, bias, output, ctx);
+    GetPrimitiveFactory<T_in, T_w, float>(dev_ctx, prim_key)
+        ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx);
   } else if (fuse_relu) {
-    GetPrimitiveFactory<T_in, T_w, uint8_t>(dev_ctx, ctx, input, w,
-                                            mkldnn_engine)
-        ->ExecuteFcPrimitive(input, w, bias, output, ctx);
+    GetPrimitiveFactory<T_in, T_w, uint8_t>(dev_ctx, prim_key)
+        ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx);
   } else {
-    GetPrimitiveFactory<T_in, T_w, int8_t>(dev_ctx, ctx, input, w,
-                                           mkldnn_engine)
-        ->ExecuteFcPrimitive(input, w, bias, output, ctx);
+    GetPrimitiveFactory<T_in, T_w, int8_t>(dev_ctx, prim_key)
+        ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx);
   }
 }
 
@@ -526,9 +548,6 @@ class FCMKLDNNOpKernel : public framework::OpKernel<T_in> {
     PADDLE_ENFORCE_EQ(
         platform::is_cpu_place(ctx.GetPlace()), true,
         platform::errors::PreconditionNotMet("FC MKL-DNN must use CPUPlace."));
-    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-
     auto input = ctx.Input<LoDTensor>("Input");
     auto w = ctx.Input<Tensor>("W");
     auto bias = ctx.Input<Tensor>("Bias");
@@ -537,8 +556,8 @@ class FCMKLDNNOpKernel : public framework::OpKernel<T_in> {
     bool fuse_relu = ctx.Attr<std::string>("activation_type") == "relu";
     bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
 
-    ExecuteFc<T_in, T_w>(dev_ctx, ctx, input, w, bias, output, mkldnn_engine,
-                         fuse_relu, force_fp32_output);
+    ExecuteFc<T_in, T_w>(ctx, input, w, bias, output, fuse_relu,
+                         force_fp32_output);
 
     output->set_layout(DataLayout::kMKLDNN);
   }
diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
index d0ecca78ae8b27451bc51a3c1561609fc470a9f8..98200caca8cf66960632b88966f23e99fcd4c299 100644
--- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
 #include "paddle/fluid/operators/mean_op.h"
@@ -35,23 +36,11 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
     T* data = tensor->mutable_data<T>(context.GetPlace());
     int64_t size = tensor->numel();
     std::normal_distribution<T> dist(mean, std);
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    auto engine = framework::GetCPURandomEngine(seed);
 
-    if (framework::Generator::GetInstance()->is_init_py) {
-      std::mt19937_64& gen_engine =
-          framework::Generator::GetInstance()->GetCPUEngine();
-      for (int64_t i = 0; i < size; ++i) {
-        data[i] = dist(gen_engine);
-      }
-    } else {
-      unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-      std::minstd_rand engine;
-      if (seed == 0) {
-        seed = std::random_device()();
-      }
-      engine.seed(seed);
-      for (int64_t i = 0; i < size; ++i) {
-        data[i] = dist(engine);
-      }
+    for (int64_t i = 0; i < size; ++i) {
+      data[i] = dist(*engine);
     }
 
     tensor->set_layout(DataLayout::kMKLDNN);
diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc
index bde7131379a272e31fb1effe2f92204fa27f9a14..e3da79125be24f3156b10a4d1daedd3db2b841cf 100644
--- a/paddle/fluid/operators/optimizers/adadelta_op.cc
+++ b/paddle/fluid/operators/optimizers/adadelta_op.cc
@@ -24,49 +24,69 @@ class AdadeltaOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredGrad"),
-                   "Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
-                   "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Param) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Grad) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("AvgSquaredGrad"), true,
+        platform::errors::InvalidArgument(
+            "Input(AvgSquaredGrad) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("AvgSquaredUpdate"), true,
+        platform::errors::InvalidArgument(
+            "Input(AvgSquaredUpdate) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
         ctx->GetInputsVarType("Param").front() ==
             framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-    PADDLE_ENFORCE(
+        true, platform::errors::InvalidArgument(
+                  "The input var(%s)'s type should be LoDTensor, "
+                  "but the received is %s",
+                  ctx->Inputs("Param").front(),
+                  ctx->GetInputsVarType("Param").front()));
+    PADDLE_ENFORCE_EQ(
         ctx->GetInputsVarType("Grad").front() ==
             framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("AvgSquaredGradOut"),
-        "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("AvgSquaredUpdateOut"),
-        "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.");
+        true, platform::errors::InvalidArgument(
+                  "The input var(%s)'s type should be LoDTensor, "
+                  "but the received is %s",
+                  ctx->Inputs("Grad").front(),
+                  ctx->GetInputsVarType("Grad").front()));
+
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("ParamOut"), true,
+        platform::errors::InvalidArgument(
+            "Output(ParamOut) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("AvgSquaredGradOut"), true,
+        platform::errors::InvalidArgument(
+            "Output(AvgSquaredGradOut) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("AvgSquaredUpdateOut"), true,
+        platform::errors::InvalidArgument(
+            "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null."));
 
     auto param_dim = ctx->GetInputDim("Param");
     PADDLE_ENFORCE_EQ(
         param_dim, ctx->GetInputDim("Grad"),
         "param and grad input of AdadeltaOp should have same dimension");
-    PADDLE_ENFORCE_NE(framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0,
-                      "Maybe the Input variable AvgSquaredGrad has not "
-                      "been initialized. You may need to confirm if you put "
-                      "exe.run(startup_program) after optimizer.minimize "
-                      "function.");
+    PADDLE_ENFORCE_NE(
+        framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0,
+        platform::errors::InvalidArgument(
+            "Maybe the Input variable AvgSquaredGrad has not "
+            "been initialized. You may need to confirm if you put "
+            "exe.run(startup_program) after optimizer.minimize "
+            "function."));
     PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"),
-                      "Param and AvgSquaredGrad input of AdadeltaOp "
-                      "should have same dimension");
+                      platform::errors::InvalidArgument(
+                          "Param and AvgSquaredGrad input of AdadeltaOp "
+                          "should have same dimension"));
     PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"),
-                      "Param and AvgSquaredUpdate input of AdadeltaOp "
-                      "should have same dimension");
+                      platform::errors::InvalidArgument(
+                          "Param and AvgSquaredUpdate input of AdadeltaOp "
+                          "should have same dimension"));
 
     ctx->SetOutputDim("ParamOut", param_dim);
     ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h
index e66dec7cf0ff686f91103e438b6374fce29af774..85cfad35858bbe6b112169f196c0711d981e9446 100644
--- a/paddle/fluid/operators/optimizers/adadelta_op.h
+++ b/paddle/fluid/operators/optimizers/adadelta_op.h
@@ -24,17 +24,19 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.InputNames("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
+    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Param").front(),
+                          framework::ToTypeName(param_var->Type())));
     const auto* grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.InputNames("Grad").front(),
-                   framework::ToTypeName(grad_var->Type()));
+    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Grad").front(),
+                          framework::ToTypeName(grad_var->Type())));
 
     auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
     auto avg_squared_grad_out_tensor =
diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc
index 92ce600f22b64f82a053233dbd130adefca964fa..7f0b2b7d064ed12875577fee2265ab17c1fce08f 100644
--- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc
@@ -25,15 +25,11 @@ class DGCMomentumOp : public MomentumOp {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("current_step"), true,
-                      "current_step should be set.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
-                      platform::errors::NotFound(
-                          "Input(nranks) of DGCMomentumOp is not found."));
-
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Grad_out"), true,
-                      platform::errors::NotFound(
-                          "Output(Grad_out) of DGCMomentumOp is not found."));
+    OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input", "current_step",
+                   "DGCMomentumOp");
+    OP_INOUT_CHECK(ctx->HasInput("nranks"), "Input", "nranks", "DGCMomentumOp");
+    OP_INOUT_CHECK(ctx->HasOutput("Grad_out"), "Output", "Grad_out",
+                   "DGCMomentumOp");
     return MomentumOp::InferShape(ctx);
   }
 
diff --git a/paddle/fluid/operators/p_norm_op.cc b/paddle/fluid/operators/p_norm_op.cc
index aa39821051eed11c3aa02c4baabfef539d7d7692..59035d5a8ca5d4214f1370e1b14b2be9b234fa6a 100644
--- a/paddle/fluid/operators/p_norm_op.cc
+++ b/paddle/fluid/operators/p_norm_op.cc
@@ -42,6 +42,11 @@ class PnormOpMaker : public framework::OpProtoAndCheckerMaker {
         "keepdim",
         "(bool, default false) Whether to keep the dimensions as the input.")
         .SetDefault(false);
+
+    AddAttr<bool>("asvector",
+                  "(bool, default false) as vector norm when axis is None and "
+                  "input is matrix, ")
+        .SetDefault(false);
     AddOutput("Out", "(Tensor) Output result tensor of p-norm");
     AddComment(R"DOC(
 Pnorm Operator.
@@ -96,10 +101,15 @@ class PnormOp : public framework::OperatorWithKernel {
                           "Current Input(X)'s shape is=[%s].",
                           axis, x_rank, x_dim));
 
-    if (axis < 0) axis = x_dim.size() + axis;
+    if (axis < 0) axis = x_dim.size() + axis;  // used by x_dim[axis] below
     std::vector<int> reduce_dims;
-    for (int i = 0; i < x_dim.size(); ++i) {
-      if (i != axis) reduce_dims.emplace_back(x_dim[i]);
+    bool asvector = ctx->Attrs().Get<bool>("asvector");
+    if (asvector) {
+      reduce_dims.emplace_back(1);
+    } else {
+      for (int i = 0; i < x_dim.size(); ++i) {
+        if (i != axis) reduce_dims.emplace_back(x_dim[i]);
+      }
     }
     x_dim[axis] = 1;
 
diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu
index 63f2a1c56c12522bc8a029e392ff02f5a28b45df..ba0d46f4c73ec2683e51722033713c5cb3736643 100644
--- a/paddle/fluid/operators/p_norm_op.cu
+++ b/paddle/fluid/operators/p_norm_op.cu
@@ -129,9 +129,10 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
     auto ndim = out_norm->dims();
     float porder = ctx.Attr<float>("porder");
     int axis = ctx.Attr<int>("axis");
+    bool asvector = ctx.Attr<bool>("asvector");
     if (axis < 0) axis = xdim.size() + axis;
     int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
+    GetDims(xdim, axis, &pre, &n, &post, asvector);
 
     auto& dev_ctx = ctx.cuda_device_context();
 
@@ -230,9 +231,10 @@ class PnormGradCUDAKernel : public framework::OpKernel<T> {
     float porder = ctx.Attr<float>("porder");
     T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
     int axis = ctx.Attr<int>("axis");
+    bool asvector = ctx.Attr<bool>("asvector");
     if (axis < 0) axis = xdim.size() + axis;
     int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
+    GetDims(xdim, axis, &pre, &n, &post, asvector);
 
     auto& dev_ctx = ctx.cuda_device_context();
 
diff --git a/paddle/fluid/operators/p_norm_op.h b/paddle/fluid/operators/p_norm_op.h
index 7620d1421e897f1a62ddf3a6c6e725e5a0f38bf0..8fca6924a2541d052bb2ebce0225ba5522ff6fd5 100644
--- a/paddle/fluid/operators/p_norm_op.h
+++ b/paddle/fluid/operators/p_norm_op.h
@@ -20,15 +20,19 @@ namespace paddle {
 namespace operators {
 
 inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
-                    int* post) {
+                    int* post, bool asvector) {
   *pre = 1;
   *post = 1;
   *n = dim[axis];
-  for (int i = 0; i < axis; ++i) {
-    (*pre) *= dim[i];
-  }
-  for (int i = axis + 1; i < dim.size(); ++i) {
-    (*post) *= dim[i];
+  if (asvector) {
+    *n = product(dim);
+  } else {
+    for (int i = 0; i < axis; ++i) {
+      (*pre) *= dim[i];
+    }
+    for (int i = axis + 1; i < dim.size(); ++i) {
+      (*post) *= dim[i];
+    }
   }
 }
 
@@ -43,9 +47,10 @@ class PnormKernel : public framework::OpKernel<T> {
     auto xdim = in_x->dims();
     float porder = ctx.Attr<float>("porder");
     int axis = ctx.Attr<int>("axis");
+    bool asvector = ctx.Attr<bool>("asvector");
     if (axis < 0) axis = xdim.size() + axis;
     int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
+    GetDims(xdim, axis, &pre, &n, &post, asvector);
 
     auto* place = ctx.template device_context<DeviceContext>().eigen_device();
 
@@ -91,9 +96,10 @@ class PnormGradKernel : public framework::OpKernel<T> {
     float porder = ctx.Attr<float>("porder");
 
     int axis = ctx.Attr<int>("axis");
+    bool asvector = ctx.Attr<bool>("asvector");
     if (axis < 0) axis = xdim.size() + axis;
     int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
+    GetDims(xdim, axis, &pre, &n, &post, asvector);
     Eigen::DSizes<int, 3> shape(pre, n, post);
     Eigen::DSizes<int, 3> rshape(pre, 1, post);
 
diff --git a/paddle/fluid/operators/randint_op.cc b/paddle/fluid/operators/randint_op.cc
index 662fe3bcb3b3b2d26afaef0c9388dda329aea645..b3a2e14331955f42761601cabe52311e40c28eaa 100644
--- a/paddle/fluid/operators/randint_op.cc
+++ b/paddle/fluid/operators/randint_op.cc
@@ -46,22 +46,11 @@ class CPURandintKernel : public framework::OpKernel<T> {
 
     std::uniform_int_distribution<T> dist(ctx.Attr<int>("low"),
                                           ctx.Attr<int>("high") - 1);
+    unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
+    auto engine = framework::GetCPURandomEngine(seed);
 
-    if (framework::Generator::GetInstance()->is_init_py) {
-      std::mt19937_64& gen_engine =
-          framework::Generator::GetInstance()->GetCPUEngine();
-      for (int64_t i = 0; i < size; ++i) data[i] = dist(gen_engine);
-    } else {
-      unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
-      std::minstd_rand engine;
-      if (seed == 0) {
-        seed = std::random_device()();
-      }
-      engine.seed(seed);
-
-      for (int64_t i = 0; i < size; ++i) {
-        data[i] = dist(engine);
-      }
+    for (int64_t i = 0; i < size; ++i) {
+      data[i] = dist(*engine);
     }
   }
 };
diff --git a/paddle/fluid/operators/randperm_op.h b/paddle/fluid/operators/randperm_op.h
index 0eb028ad806848a559ba51b9c950d324a598a851..02aabb9a7b569c5aa8354f191c0de1497d9f9324 100644
--- a/paddle/fluid/operators/randperm_op.h
+++ b/paddle/fluid/operators/randperm_op.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <ctime>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -29,20 +30,12 @@ namespace operators {
 
 template <typename T>
 static inline void random_permate(T* data_ptr, int num, unsigned int seed) {
+  auto engine = framework::GetCPURandomEngine(seed);
   for (int i = 0; i < num; ++i) {
     data_ptr[i] = static_cast<T>(i);
   }
-  if (framework::Generator::GetInstance()->is_init_py) {
-    std::shuffle(data_ptr, data_ptr + num,
-                 framework::Generator::GetInstance()->GetCPUEngine());
 
-  } else {
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    std::srand(seed);
-    std::random_shuffle(data_ptr, data_ptr + num);
-  }
+  std::shuffle(data_ptr, data_ptr + num, *engine);
 }
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h
index a09220b1ccd13604b6d842237c8176578967ac64..9bec08f593afebae736fcbb3eb42fc20992df779 100644
--- a/paddle/fluid/operators/sampling_id_op.h
+++ b/paddle/fluid/operators/sampling_id_op.h
@@ -51,20 +51,15 @@ class SamplingIdKernel : public framework::OpKernel<T> {
     framework::TensorToVector(*input, context.device_context(), &ins_vector);
 
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
+
     std::uniform_real_distribution<T> dist(
         static_cast<T>(context.Attr<float>("min")),
         static_cast<T>(context.Attr<float>("max")));
 
+    auto engine = framework::GetCPURandomEngine(seed);
     std::vector<int64_t> ids(batch_size);
     for (int i = 0; i < batch_size; ++i) {
-      T r = framework::Generator::GetInstance()->is_init_py
-                ? dist(framework::Generator::GetInstance()->GetCPUEngine())
-                : dist(engine);
+      T r = dist(*engine);
       int idx = width - 1;
       for (int j = 0; j < width; ++j) {
         if ((r -= ins_vector[i * width + j]) < 0) {
diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc
index 1fbf6d00ef763f4cb608be6d62cf4bff54f620ec..d3f9754d307c6040a66a3452d7bb008159ff46e5 100644
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -23,22 +23,27 @@ class TopkOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of TopkOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of TopkOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Indices"),
-                   "Output(Indices) of TopkOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(X) of TopkOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
+                      platform::errors::InvalidArgument(
+                          "Output(Out) of TopkOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Indices"), true,
+                      platform::errors::InvalidArgument(
+                          "Output(Indices) of TopkOp should not be null."));
 
     auto input_dims = ctx->GetInputDim("X");
     const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
 
     PADDLE_ENFORCE_GE(k, 1, "k must >= 1");
-    PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape");
+    PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument(
+                                                "input must have >= 1d shape"));
 
     if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k,
-                        "input must have >= k columns");
+      PADDLE_ENFORCE_GE(
+          input_dims[input_dims.size() - 1], k,
+          platform::errors::InvalidArgument("input must have >= k columns"));
     }
 
     framework::DDim dims = input_dims;
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index d8b2e92616091a8c822c6fd0bfdfb1148c25534d..0a694e1ad5b012d70a89ddcca2d70fbe8c9e24ba 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -43,8 +43,9 @@ template <typename DeviceContext, typename T>
 class TopkOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::InvalidArgument("It must use CUDAPlace."));
     auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
     auto* indices = ctx.Output<Tensor>("Indices");
diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu
index 5154503292014fa43efae919f20d731060b9db57..2c94dca1e3a461a44b98e9acf604cc4b488b5fd7 100644
--- a/paddle/fluid/operators/top_k_v2_op.cu
+++ b/paddle/fluid/operators/top_k_v2_op.cu
@@ -14,7 +14,6 @@
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/p_norm_op.h"
 #include "paddle/fluid/operators/top_k_function_cuda.h"
 #include "paddle/fluid/operators/top_k_v2_op.h"
 
diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h
index a77285d123644e6ea2b9077f3338b92add42f7f0..89b5d36b1b3f915e8719c8791e8c12c2e0348f26 100644
--- a/paddle/fluid/operators/top_k_v2_op.h
+++ b/paddle/fluid/operators/top_k_v2_op.h
@@ -33,6 +33,19 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
+                    int* post) {
+  // Views `dim` as (pre, n, post): n is the extent at `axis`, pre and post
+  // are the products of the extents before and after it.
+  *pre = 1;
+  *n = dim[axis];
+  *post = 1;
+  for (int d = 0; d < dim.size(); ++d) {
+    if (d < axis) *pre *= dim[d];
+    if (d > axis) *post *= dim[d];
+  }
+}
+
 template <typename T, typename Type>
 static void FullTopK(Type input_height, Type input_width, int input_dim,
                      const framework::Tensor* input, T* t_out, Type* t_indices,
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc
index 3aa9ff544af63993521d41604cecef0b283ebc1e..419f0f7a2a57822b422986f9b676de138a7404cd 100644
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <limits>
 #include <random>
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -167,22 +168,10 @@ class CPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
     TruncatedNormal<T> truncated_normal(mean, std);
     int64_t size = tensor->numel();
 
-    if (framework::Generator::GetInstance()->is_init_py) {
-      std::mt19937_64& gen_engine =
-          framework::Generator::GetInstance()->GetCPUEngine();
-      for (int64_t i = 0; i < size; ++i) {
-        data[i] = truncated_normal(dist(gen_engine));
-      }
-    } else {
-      unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-      std::minstd_rand engine;
-      if (seed == 0) {
-        seed = std::random_device()();
-      }
-      engine.seed(seed);
-      for (int64_t i = 0; i < size; ++i) {
-        data[i] = truncated_normal(dist(engine));
-      }
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    auto engine = framework::GetCPURandomEngine(seed);
+    for (int64_t i = 0; i < size; ++i) {
+      data[i] = truncated_normal(dist(*engine));
     }
   }
 };
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index a4487cde277990a725fd4c37b6d807278e314343..9cffe09a33abf29308072d6b3c8bfb8a636048da 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/uniform_random_op.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -62,34 +64,12 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
     std::uniform_real_distribution<T> dist(
         static_cast<T>(ctx.Attr<float>("min")),
         static_cast<T>(ctx.Attr<float>("max")));
-    auto gen_ptr = framework::Generator::GetInstance();
-    if (gen_ptr->is_init_py) {
-      std::mt19937_64 &gen_engine = gen_ptr->GetCPUEngine();
-      // auto gen_engine = gen_ptr_->GetCPUEngine();
-      // std::uniform_real_distribution<T> dist(
-      //    static_cast<T>(ctx.Attr<float>("min")),
-      //    static_cast<T>(ctx.Attr<float>("max")));
+    unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
+    auto engine = framework::GetCPURandomEngine(seed);
 
-      for (int64_t i = 0; i < size; ++i) {
-        data[i] = dist(gen_engine);
-      }
-    } else {
-      unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
-      std::minstd_rand engine;
-      if (seed == 0) {
-        seed = std::random_device()();
-      }
-      engine.seed(seed);
-      // std::uniform_real_distribution<T> dist(
-      //    static_cast<T>(ctx.Attr<float>("min")),
-      //    static_cast<T>(ctx.Attr<float>("max")));
-      // int64_t size = tensor->numel();
-      for (int64_t i = 0; i < size; ++i) {
-        data[i] = dist(engine);
-      }
+    for (int64_t i = 0; i < size; ++i) {
+      data[i] = dist(*engine);
     }
-    // std::mt19937_64 &engine = gen_ptr->GetCPUEngine();
-    // auto engine = gen_ptr_->GetCPUEngine();
 
     unsigned int diag_num =
         static_cast<unsigned int>(ctx.Attr<int>("diag_num"));
@@ -139,12 +119,12 @@ class UniformRandomOp : public framework::OperatorWithKernel {
     if (ctx->HasInputs("ShapeTensorList")) {
       // top prority shape
       auto inputs_name = ctx->Inputs("ShapeTensorList");
-      PADDLE_ENFORCE_GT(
-          inputs_name.size(), 0,
-          platform::errors::InvalidArgument(
-              "Input(ShapeTensorList)'size of Op(uniform_random) can't be zero."
-              "Please check the Attr(shape)'s size of"
-              "Op(fluid.layers.uniform_random).)"));
+      PADDLE_ENFORCE_GT(inputs_name.size(), 0,  // list must not be empty
+                        platform::errors::InvalidArgument(
+                            "Input(ShapeTensorList)'s size of "
+                            "Op(uniform_random) can't be zero. "
+                            "Please check the Attr(shape)'s size of "
+                            "Op(fluid.layers.uniform_random)."));
       auto out_dims = std::vector<int>(inputs_name.size(), -1);
       ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
 
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index c024bb87b09c00c34dbaaf7b747f29743152502f..4df1e0ffeb97564803f452114d52ab03d0464f8a 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -88,15 +89,12 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
     }
     T* data = tensor->mutable_data<T>(context.GetPlace());
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    if (framework::Generator::GetInstance()->is_init_py) {
-      seed = static_cast<unsigned int>(
-          framework::Generator::GetInstance()->GetCurrentSeed());
-    } else {
-      if (seed == 0) {
-        std::random_device rd;
-        seed = rd();
-      }
+
+    if (seed == 0) {
+      std::random_device rd;
+      seed = rd();
     }
+
     T min = static_cast<T>(context.Attr<float>("min"));
     T max = static_cast<T>(context.Attr<float>("max"));
     unsigned int diag_num =
diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc
index 1aea96a15eb090ccd1a508641e4c6c0a8dcf7fb9..745102dd28d3d578ec3674221645fc1e8bdfe43a 100644
--- a/paddle/fluid/operators/unique_op.cc
+++ b/paddle/fluid/operators/unique_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/unique_op.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -149,3 +150,34 @@ REGISTER_OP_CPU_KERNEL(
     ops::UniqueKernel<paddle::platform::CPUDeviceContext, double>,
     ops::UniqueKernel<paddle::platform::CPUDeviceContext, int32_t>,
     ops::UniqueKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_VERSION(unique)  // version checkpoint for the `unique` op
+    .AddCheckpoint(
+        R"ROC(
+        Upgrade unique, add 2 outputs [Indices, Counts] and 5 attributes
+        [return_index, return_inverse, return_counts, axis, is_sorted].
+      )ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .NewOutput("Indices",
+                       "The indices of the input tensor that result in the "
+                       "unique tensor.")
+            .NewOutput("Counts", "The counts for each unique element.")
+            .NewAttr("return_index",
+                     "If True, also return the indices of the input"
+                     " tensor that result in the unique Tensor.",
+                     false)
+            .NewAttr("return_inverse",
+                     "If True, also return the indices for where elements"
+                     " in the original input ended up in the returned unique "
+                     "tensor.",
+                     false)
+            .NewAttr("return_counts",
+                     "If True, also return the counts for each unique element.",
+                     false)
+            .NewAttr("axis",
+                     "The axis to apply unique. If None, the input will be "
+                     "flattened.",
+                     {})  // NOTE(review): untyped `{}` default; confirm type
+            .NewAttr("is_sorted",
+                     "If True, the unique elements of X are in ascending order."
+                     " Otherwise, the unique elements are not sorted.",
+                     false));
diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h
index dc8b2ac5555126d8cf2bb92d2f506b1bf358e680..2bd2a2cbf34c6ccba1e6bfd1892f0f821d0f7c72 100644
--- a/paddle/fluid/operators/unique_op.h
+++ b/paddle/fluid/operators/unique_op.h
@@ -131,22 +131,22 @@ static bool Equal(const framework::Tensor& a, const framework::Tensor& b) {
   return true;
 }
 
-template <typename T>
+template <typename InT, typename IndexT>
 static void UniqueFlattendTensor(const framework::ExecutionContext& context,
                                  const framework::Tensor& in,
                                  framework::Tensor* out, bool return_index,
                                  bool return_inverse, bool return_counts) {
-  const T* in_data = in.data<T>();
-  std::set<T> unique(in_data, in_data + in.numel());
+  const InT* in_data = in.data<InT>();
+  std::set<InT> unique(in_data, in_data + in.numel());
   out->Resize(framework::make_ddim({static_cast<int64_t>(unique.size())}));
-  auto out_data = out->mutable_data<T>(context.GetPlace());
+  auto out_data = out->mutable_data<InT>(context.GetPlace());
   std::copy(unique.begin(), unique.end(), out_data);
 
   if (return_index) {
     auto* indices = context.Output<framework::Tensor>("Indices");
     indices->Resize(framework::make_ddim({out->numel()}));
-    auto indices_data = indices->mutable_data<int64_t>(context.GetPlace());
-    std::unordered_map<T, int64_t> indices_map;
+    auto indices_data = indices->mutable_data<IndexT>(context.GetPlace());
+    std::unordered_map<InT, IndexT> indices_map;
     indices_map.reserve(out->numel());
     for (int64_t i = 0; i < in.numel(); ++i) {
       if (indices_map.find(in_data[i]) != indices_map.end()) continue;
@@ -160,8 +160,8 @@ static void UniqueFlattendTensor(const framework::ExecutionContext& context,
   if (return_inverse) {
     auto* inverse = context.Output<framework::Tensor>("Index");
     inverse->Resize(framework::make_ddim({in.numel()}));
-    auto inverse_data = inverse->mutable_data<int64_t>(context.GetPlace());
-    std::unordered_map<T, int64_t> inverse_map;
+    auto inverse_data = inverse->mutable_data<IndexT>(context.GetPlace());
+    std::unordered_map<InT, IndexT> inverse_map;
     inverse_map.reserve(out->numel());
     for (int64_t i = 0; i < out->numel(); ++i) {
       inverse_map[out_data[i]] = i;
@@ -174,8 +174,8 @@ static void UniqueFlattendTensor(const framework::ExecutionContext& context,
   if (return_counts) {
     auto* count = context.Output<framework::Tensor>("Counts");
     count->Resize(framework::make_ddim({out->numel()}));
-    auto count_data = count->mutable_data<int64_t>(context.GetPlace());
-    std::unordered_map<T, int64_t> counts_map;
+    auto count_data = count->mutable_data<IndexT>(context.GetPlace());
+    std::unordered_map<InT, IndexT> counts_map;
     counts_map.reserve(out->numel());
     for (int64_t i = 0; i < out->numel(); ++i) {
       counts_map[out_data[i]] = 0;
@@ -189,13 +189,13 @@ static void UniqueFlattendTensor(const framework::ExecutionContext& context,
   }
 }
 
-template <class ForwardIt, typename T>
+template <class ForwardIt, typename InT, typename IndexT>
 static ForwardIt UniqueDimImpl(const framework::ExecutionContext& context,
                                ForwardIt first, ForwardIt last,
-                               const std::vector<int64_t>& sorted_indices_vec,
-                               std::vector<int64_t>* inverse_vec,
-                               std::vector<int64_t>* counts_vec,
-                               std::vector<int64_t>* indices_vec) {
+                               const std::vector<IndexT>& sorted_indices_vec,
+                               std::vector<IndexT>* inverse_vec,
+                               std::vector<IndexT>* counts_vec,
+                               std::vector<IndexT>* indices_vec) {
   if (first == last) {
     return last;
   }
@@ -210,7 +210,7 @@ static ForwardIt UniqueDimImpl(const framework::ExecutionContext& context,
   while (++first != last) {
     int64_t idx_first = std::distance(begin, first);
     int64_t idx_result = std::distance(begin, result);
-    if (!Equal<T>(*result, *first)) {
+    if (!Equal<InT>(*result, *first)) {
       if (++result != first) {
         *result = std::move(*first);
       }
@@ -223,7 +223,7 @@ static ForwardIt UniqueDimImpl(const framework::ExecutionContext& context,
   return ++result;
 }
 
-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename InT, typename IndexT>
 static void UniqueDim(const framework::ExecutionContext& context,
                       const framework::Tensor& in, framework::Tensor* out,
                       bool return_index, bool return_inverse,
@@ -239,25 +239,25 @@ static void UniqueDim(const framework::ExecutionContext& context,
   framework::Tensor in_trans;
   framework::DDim in_trans_dims = framework::make_ddim(in_trans_dims_vec);
   in_trans.Resize(in_trans_dims);
-  in_trans.mutable_data<T>(context.GetPlace());
+  in_trans.mutable_data<InT>(context.GetPlace());
   auto& dev_ctx = context.template device_context<DeviceContext>();
-  TransCompute<DeviceContext, T>(in.dims().size(), dev_ctx, in, &in_trans,
-                                 permute);
+  TransCompute<DeviceContext, InT>(in.dims().size(), dev_ctx, in, &in_trans,
+                                   permute);
   // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
   framework::DDim in_trans_flat_dims =
       framework::flatten_to_2d(in_trans_dims, 1);
   in_trans.Resize(in_trans_flat_dims);
 
   // sort indices
-  std::vector<int64_t> sorted_indices_vec(in_trans.dims()[0]);
+  std::vector<IndexT> sorted_indices_vec(in_trans.dims()[0]);
   std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0);
   int64_t col = in_trans.dims()[1];
-  const T* in_trans_data = in_trans.data<T>();
+  const InT* in_trans_data = in_trans.data<InT>();
   std::sort(sorted_indices_vec.begin(), sorted_indices_vec.end(),
             [&](int64_t a, int64_t b) -> bool {
               for (int64_t i = 0; i < col; ++i) {
-                T lhs = in_trans_data[i + a * col];
-                T rhs = in_trans_data[i + b * col];
+                InT lhs = in_trans_data[i + a * col];
+                InT rhs = in_trans_data[i + b * col];
                 if (lhs < rhs) {
                   return true;
                 } else if (lhs > rhs) {
@@ -270,18 +270,19 @@ static void UniqueDim(const framework::ExecutionContext& context,
   // sort tensor according to indices
   framework::Tensor input_sorted;
   input_sorted.Resize(in_trans_dims);
-  input_sorted.mutable_data<T>(context.GetPlace());
-  T* input_sorted_data = input_sorted.data<T>();
+  input_sorted.mutable_data<InT>(context.GetPlace());
+  InT* input_sorted_data = input_sorted.data<InT>();
   for (size_t i = 0; i < sorted_indices_vec.size(); ++i) {
     memcpy(input_sorted_data + i * col,
-           in_trans_data + sorted_indices_vec[i] * col, col * sizeof(T));
+           in_trans_data + static_cast<int64_t>(sorted_indices_vec[i]) * col,
+           col * sizeof(InT));
   }
 
   std::vector<framework::Tensor> input_unbind = Unbind(input_sorted);
-  std::vector<int64_t> inverse_vec(sorted_indices_vec.size(), 0);
-  std::vector<int64_t> counts_vec(sorted_indices_vec.size(), 0);
-  std::vector<int64_t> indices_vec(sorted_indices_vec.size(), 0);
-  auto last = UniqueDimImpl<std::vector<framework::Tensor>::iterator, T>(
+  std::vector<IndexT> inverse_vec(sorted_indices_vec.size(), 0);
+  std::vector<IndexT> counts_vec(sorted_indices_vec.size(), 0);
+  std::vector<IndexT> indices_vec(sorted_indices_vec.size(), 0);
+  auto last = UniqueDimImpl<std::vector<framework::Tensor>::iterator, InT>(
       context, input_unbind.begin(), input_unbind.end(), sorted_indices_vec,
       &inverse_vec, &counts_vec, &indices_vec);
   input_unbind.erase(last, input_unbind.end());
@@ -289,18 +290,18 @@ static void UniqueDim(const framework::ExecutionContext& context,
   indices_vec.erase(indices_vec.begin() + input_unbind.size(),
                     indices_vec.end());
 
-  math::ConcatFunctor<DeviceContext, T> concat_functor;
+  math::ConcatFunctor<DeviceContext, InT> concat_functor;
   framework::Tensor out_trans;
   std::vector<int64_t> out_trans_dims_vec = in_trans_dims_vec;
   out_trans_dims_vec[0] = input_unbind.size();
   out_trans.Resize(framework::make_ddim(out_trans_dims_vec));
-  out_trans.mutable_data<T>(context.GetPlace());
+  out_trans.mutable_data<InT>(context.GetPlace());
   std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]);
   out->Resize(framework::make_ddim(out_trans_dims_vec));
-  out->mutable_data<T>(context.GetPlace());
+  out->mutable_data<InT>(context.GetPlace());
   concat_functor(dev_ctx, input_unbind, 0, &out_trans);
-  TransCompute<DeviceContext, T>(out_trans.dims().size(), dev_ctx, out_trans,
-                                 out, permute);
+  TransCompute<DeviceContext, InT>(out_trans.dims().size(), dev_ctx, out_trans,
+                                   out, permute);
 
   if (return_inverse) {
     auto* inverse = context.Output<framework::Tensor>("Index");
@@ -318,15 +319,80 @@ static void UniqueDim(const framework::ExecutionContext& context,
   }
 }
 
+template <typename DeviceContext, typename InT>  // InT: element type of in_
+struct UniqueFlattendTensorFunctor {  // bundles args for UniqueFlattendTensor
+  const framework::ExecutionContext& ctx_;  // held by reference, not copied
+  const framework::Tensor& in_;             // held by reference, not copied
+  framework::Tensor* out_;                  // raw pointer, forwarded as-is
+  const bool return_index_;
+  const bool return_inverse_;
+  const bool return_counts_;
+
+  UniqueFlattendTensorFunctor(const framework::ExecutionContext& context,
+                              const framework::Tensor& in,
+                              framework::Tensor* out, bool return_index,
+                              bool return_inverse, bool return_counts)
+      : ctx_(context),
+        in_(in),
+        out_(out),
+        return_index_(return_index),
+        return_inverse_(return_inverse),
+        return_counts_(return_counts) {}
+
+  template <typename IndexT>
+  void apply() const {  // IndexT: type of the Indices/Index/Counts data
+    UniqueFlattendTensor<InT, IndexT>(ctx_, in_, out_, return_index_,
+                                      return_inverse_, return_counts_);
+  }
+};  // note: DeviceContext is never referenced inside this struct
+
+template <typename DeviceContext, typename InT>  // InT: element type of in_
+struct UniqueDimFunctor {  // bundles the arguments for a UniqueDim call
+  const framework::ExecutionContext& ctx_;  // held by reference, not copied
+  const framework::Tensor& in_;             // held by reference, not copied
+  framework::Tensor* out_;                  // raw pointer, forwarded as-is
+  const int axis_;  // 4th ctor argument, but passed LAST to UniqueDim
+  const bool return_index_;
+  const bool return_inverse_;
+  const bool return_counts_;
+
+  UniqueDimFunctor(const framework::ExecutionContext& context,
+                   const framework::Tensor& in, framework::Tensor* out,
+                   const int axis, bool return_index, bool return_inverse,
+                   bool return_counts)
+      : ctx_(context),
+        in_(in),
+        out_(out),
+        axis_(axis),
+        return_index_(return_index),
+        return_inverse_(return_inverse),
+        return_counts_(return_counts) {}
+
+  template <typename IndexT>
+  void apply() const {  // IndexT: element type of UniqueDim's index vectors
+    UniqueDim<DeviceContext, InT, IndexT>(
+        ctx_, in_, out_, return_index_, return_inverse_, return_counts_, axis_);
+  }
+};
+
 template <typename DeviceContext, typename T>
 class UniqueKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
+    auto data_type = static_cast<framework::proto::VarType::Type>(
+        context.Attr<int>("dtype"));
+    if (data_type == framework::proto::VarType::INT32) {
+      PADDLE_ENFORCE_LE(
+          x->numel(), INT_MAX,
+          platform::errors::InvalidArgument(
+              "The number of elements in Input(X) should be less than or "
+              "equal to INT_MAX, but received num is %d. Please set `dtype` to "
+              "int64.",
+              x->numel()));
+    }
     if (!context.Attr<bool>("is_sorted")) {
-      auto data_type = static_cast<framework::proto::VarType::Type>(
-          context.Attr<int>("dtype"));
       auto* index = context.Output<framework::Tensor>("Index");
 
       framework::VisitDataType(data_type, UniqueOpFunctor<T>(out, index, x));
@@ -339,12 +405,16 @@ class UniqueKernel : public framework::OpKernel<T> {
     bool return_counts = context.Attr<bool>("return_counts");
 
     if (axis_vec.empty()) {
-      UniqueFlattendTensor<T>(context, *x, out, return_index, return_inverse,
-                              return_counts);
+      framework::VisitDataTypeSmall(
+          data_type,
+          UniqueFlattendTensorFunctor<DeviceContext, T>(
+              context, *x, out, return_index, return_inverse, return_counts));
     } else {
       int axis = axis_vec[0];
-      UniqueDim<DeviceContext, T>(context, *x, out, return_index,
-                                  return_inverse, return_counts, axis);
+      framework::VisitDataTypeSmall(
+          data_type, UniqueDimFunctor<DeviceContext, T>(
+                         context, *x, out, axis, return_index, return_inverse,
+                         return_counts));
     }
   }
 };
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 8667375c6f2726f1099c6e57c6e793252b01d454..af8798a4b7cf5a8832ce9345cad45ce3096484e4 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -508,3 +508,16 @@ DEFINE_int32(
     "summary will be shown."
     "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
     "error message summary will be shown.");
+
+/**
+ * Debug related FLAG
+ * Name: sort_sum_gradient
+ * Since Version: 2.0.0
+ * Value Range: bool, default=false
+ * Example: FLAGS_sort_sum_gradient=true enables the reverse-order summation.
+ * Note: If True, gradients are summed by the reverse order of
+ * the forward execution sequence.
+ */
+DEFINE_bool(sort_sum_gradient, false,
+            "Sum gradients by the reverse order of "
+            "the forward execution sequence.");
diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc
index 3bccd5fb2dd92298323381c09467937abd87a53c..90b7f501052530a306ba22ea6a244f0ef8fad563 100644
--- a/paddle/fluid/pybind/generator_py.cc
+++ b/paddle/fluid/pybind/generator_py.cc
@@ -29,23 +29,36 @@ namespace py = pybind11;
 
 namespace paddle {
 namespace pybind {
-void BindGenerator(py::module* m) {
-  py::class_<framework::GeneratorState>(*m, "GeneratorState", "");
-  py::class_<std::mt19937_64>(*m, "mt19937_64", "");
+void BindGenerator(py::module* m_ptr) {  // adds Generator bindings to *m_ptr
+  auto& m = *m_ptr;
+  py::class_<framework::GeneratorState,
+             std::shared_ptr<framework::GeneratorState>>(m, "GeneratorState")
+      .def("current_seed",  // exposes GeneratorState::current_seed
+           [](std::shared_ptr<framework::GeneratorState>& self) {
+             return self->current_seed;
+           });
+  py::class_<std::mt19937_64>(m, "mt19937_64", "");
   py::class_<framework::Generator, std::shared_ptr<framework::Generator>>(
-      *m, "Generator")
-      .def(py::init([]() { return framework::Generator::GetInstanceX(); }),
-           py::return_value_policy::reference)
-      .def("get_state", &framework::Generator::GetState,
-           py::return_value_policy::move)
+      m, "Generator")
+      .def("__init__",
+           [](framework::Generator& self) {
+             new (&self) framework::Generator();
+           })  // NOTE(review): legacy-style init; consider py::init<>()
+      .def("get_state", &framework::Generator::GetState)
       .def("set_state", &framework::Generator::SetState)
-      .def("manual_seed", &framework::Generator::SetCurrentSeed)
+      .def("manual_seed",  // sets the seed, then returns the same Generator
+           [](std::shared_ptr<framework::Generator>& self, uint64_t seed) {
+             self->SetCurrentSeed(seed);
+             return self;
+           })
       .def("seed", &framework::Generator::Seed)
       .def("initial_seed", &framework::Generator::GetCurrentSeed)
       .def("random", &framework::Generator::Random64)
-      .def("get_cpu_engine", &framework::Generator::GetCPUEngine,
-           py::return_value_policy::move)
-      .def("set_cpu_engine", &framework::Generator::SetCPUEngine);
+      //  .def("get_cpu_engine", &framework::Generator::GetCPUEngine)
+      //  .def("set_cpu_engine", &framework::Generator::SetCPUEngine)
+      .def_property("_is_init_py", &framework::Generator::GetIsInitPy,
+                    &framework::Generator::SetIsInitPy);
+  m.def("default_cpu_generator", &framework::DefaultCPUGenerator);
 }  // end Generator
 }  // end namespace pybind
-}  // end namespace paddle
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index deca9625e63d05625c407a1282b396398bb78ccc..f1084018d9c79e46c33098dafdb48dc395dac652 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -38,6 +38,7 @@ DECLARE_bool(enable_rpc_profiler);
 DECLARE_int32(multiple_of_cupti_buffer_size);
 DECLARE_bool(reader_queue_speed_test_mode);
 DECLARE_int32(call_stack_level);
+DECLARE_bool(sort_sum_gradient);
 // device management
 DECLARE_int32(paddle_num_threads);
 // executor
@@ -340,7 +341,7 @@ static void RegisterGlobalVarGetterSetter() {
   REGISTER_PUBLIC_GLOBAL_VAR(
       FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
       FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
-      FLAGS_call_stack_level, FLAGS_cpu_deterministic,
+      FLAGS_call_stack_level, FLAGS_sort_sum_gradient, FLAGS_cpu_deterministic,
       FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
       FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
       FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 021d10ca7facb0bac11cd5d08eddea7e01b9b566..489dd198876204486fc94518fbef0c806d0543d4 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -30,7 +30,6 @@ limitations under the License. */
 
 #include "paddle/fluid/imperative/all_reduce.h"
 #include "paddle/fluid/imperative/amp_auto_cast.h"
-#include "paddle/fluid/imperative/backward_strategy.h"
 #include "paddle/fluid/imperative/basic_engine.h"
 #include "paddle/fluid/imperative/data_loader.h"
 #include "paddle/fluid/imperative/layer.h"
@@ -507,50 +506,6 @@ void BindImperative(py::module *m_ptr) {
         []() { memory::allocation::MemoryMapFdSet::Instance().Clear(); });
 #endif
 
-  py::class_<imperative::detail::BackwardStrategy> backward_strategy(
-      m, "BackwardStrategy", R"DOC(
-
-    BackwardStrategy is a descriptor of how to run the backward process.
-
-    **Note**:
-        **This API is only available in** `Dygraph <../../user_guides/howto/dygraph/DyGraph.html>`_ **Mode**
-
-    Attribute:
-        **sort_sum_gradient**:
-
-        If framework will sum the gradient by the reverse order of trace. eg. x_var ( :ref:`api_guide_Variable` ) will be the input of multiple OP such as :ref:`api_fluid_layers_scale` , this attr will decide if framework will sum gradient of `x_var` by the reverse order.
-
-        By Default: False
-
-        Examples:
-            .. code-block:: python
-
-                import numpy as np
-                import paddle.fluid as fluid
-
-                x = np.ones([2, 2], np.float32)
-                with fluid.dygraph.guard():
-                    x_var = fluid.dygraph.to_variable(x)
-                    sums_inputs = []
-                    # x_var will be multi-scales' input here
-                    for _ in range(10):
-                        sums_inputs.append(fluid.layers.scale(x_var))
-                    ret2 = fluid.layers.sums(sums_inputs)
-                    loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
-      )DOC");
-  backward_strategy.def(py::init())
-      .def_property("sort_sum_gradient",
-                    [](const imperative::detail::BackwardStrategy &self) {
-                      return self.sorted_sum_gradient_;
-                    },
-                    [](imperative::detail::BackwardStrategy &self,
-                       bool sorted_sum_gradient) {
-                      self.sorted_sum_gradient_ = sorted_sum_gradient;
-                    });
-
   m.def("start_imperative_gperf_profiler",
         []() { imperative::StartProfile(); });
 
@@ -745,21 +700,18 @@ void BindImperative(py::module *m_ptr) {
                          inputs2.append(tmp)
                     ret2 = fluid.layers.sums(inputs2)
                     loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
+                    loss2.backward()
                     print(loss2.gradient())
                     loss2.clear_gradient()
                     print("After clear {}".format(loss2.gradient()))
       )DOC")
       .def("_run_backward",
-           [](imperative::VarBase &self,
-              const imperative::detail::BackwardStrategy &bckst,
-              const imperative::Tracer &tracer, bool retain_graph) {
+           [](imperative::VarBase &self, const imperative::Tracer &tracer,
+              bool retain_graph) {
              // TODO(jiabin): when we impl more backward execution we can
              // select them
              auto *engine = tracer.GetEngine();
-             engine->Init(&self, bckst, retain_graph);
+             engine->Init(&self, retain_graph);
              VLOG(3) << "Start backward";
              engine->Execute();
              VLOG(3) << "Finish backward";
@@ -1024,13 +976,11 @@ void BindImperative(py::module *m_ptr) {
              &output_targets,
          const std::vector<std::shared_ptr<imperative::VarBase>> &output_grads,
          const std::vector<std::shared_ptr<imperative::VarBase>> &no_grad_vars,
-         const platform::Place &place,
-         const imperative::detail::BackwardStrategy &strategy,
-         bool create_graph, bool retain_graph, bool allow_unused,
-         bool only_inputs) {
+         const platform::Place &place, bool create_graph, bool retain_graph,
+         bool allow_unused, bool only_inputs) {
         imperative::PartialGradEngine engine(
             input_targets, output_targets, output_grads, no_grad_vars, place,
-            strategy, create_graph, retain_graph, allow_unused, only_inputs);
+            create_graph, retain_graph, allow_unused, only_inputs);
         engine.Execute();
         return engine.GetResult();
       },
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 696da67c9c98fe16b28ceb05d5c07049104fd43b..040dd313f1c538b5792538f9da04635ff805b9a8 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -206,9 +206,9 @@ void BindInferenceApi(py::module *m) {
   BindMkldnnQuantizerConfig(m);
 #endif
   m->def("create_paddle_predictor",
-         &paddle::CreatePaddlePredictor<AnalysisConfig>);
+         &paddle::CreatePaddlePredictor<AnalysisConfig>, py::arg("config"));
   m->def("create_paddle_predictor",
-         &paddle::CreatePaddlePredictor<NativeConfig>);
+         &paddle::CreatePaddlePredictor<NativeConfig>, py::arg("config"));
   m->def("paddle_dtype_size", &paddle::PaddleDtypeSize);
   m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes);
 }
@@ -448,6 +448,7 @@ void BindAnalysisConfig(py::module *m) {
            &AnalysisConfig::cpu_math_library_num_threads)
       .def("to_native_config", &AnalysisConfig::ToNativeConfig)
       .def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer)
+      .def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableMkldnnBfloat16)
 #ifdef PADDLE_WITH_MKLDNN
       .def("quantizer_config", &AnalysisConfig::mkldnn_quantizer_config,
            py::return_value_policy::reference)
@@ -565,6 +566,7 @@ void BindPaddlePassBuilder(py::module *m) {
       .def("enable_cudnn", &PassStrategy::EnableCUDNN)
       .def("enable_mkldnn", &PassStrategy::EnableMKLDNN)
       .def("enable_mkldnn_quantizer", &PassStrategy::EnableMkldnnQuantizer)
+      .def("enable_mkldnn_bfloat16", &PassStrategy::EnableMkldnnBfloat16)
       .def("use_gpu", &PassStrategy::use_gpu);
 
   py::class_<CpuPassStrategy, PassStrategy>(*m, "CpuPassStrategy")
@@ -572,14 +574,16 @@ void BindPaddlePassBuilder(py::module *m) {
       .def(py::init<const CpuPassStrategy &>())
       .def("enable_cudnn", &CpuPassStrategy::EnableCUDNN)
       .def("enable_mkldnn", &CpuPassStrategy::EnableMKLDNN)
-      .def("enable_mkldnn_quantizer", &CpuPassStrategy::EnableMkldnnQuantizer);
+      .def("enable_mkldnn_quantizer", &CpuPassStrategy::EnableMkldnnQuantizer)
+      .def("enable_mkldnn_bfloat16", &CpuPassStrategy::EnableMkldnnBfloat16);
 
   py::class_<GpuPassStrategy, PassStrategy>(*m, "GpuPassStrategy")
       .def(py::init<>())
       .def(py::init<const GpuPassStrategy &>())
       .def("enable_cudnn", &GpuPassStrategy::EnableCUDNN)
       .def("enable_mkldnn", &GpuPassStrategy::EnableMKLDNN)
-      .def("enable_mkldnn_quantizer", &GpuPassStrategy::EnableMkldnnQuantizer);
+      .def("enable_mkldnn_quantizer", &GpuPassStrategy::EnableMkldnnQuantizer)
+      .def("enable_mkldnn_bfloat16", &GpuPassStrategy::EnableMkldnnBfloat16);
 }
 }  // namespace
 }  // namespace pybind
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index c84574b21d883b24e1f89c59c3a724aae6621479..862ab2e8db1fdc353db826204d759d99951d5142 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -125,8 +125,15 @@ echo    ========================================
 echo    Step 1. Cmake ...
 echo    ========================================
 
-echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH%
-cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH%
+echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
+-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
+-DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
+-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR%
+
+cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
+-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
+-DON_INFER=%ON_INFER%  -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
+-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR%
 goto:eof
 
 :cmake_error
@@ -276,7 +283,10 @@ echo     git fetch upstream $BRANCH # develop is not fetched>>  check_change_of_
 echo fi>>  check_change_of_unittest.sh
 echo git checkout -b origin_pr >>  check_change_of_unittest.sh
 echo git checkout -f $BRANCH >>  check_change_of_unittest.sh
-echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% >>  check_change_of_unittest.sh
+echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
+-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
+-DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
+-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% >>  check_change_of_unittest.sh
 echo cat ^<^<EOF>>  check_change_of_unittest.sh
 echo     ============================================       >>  check_change_of_unittest.sh
 echo     Generate unit tests.spec of develop.               >>  check_change_of_unittest.sh
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 57defebd7575b41c031957aa9c2f848861a006ac..926747ef6186e3b9439baf787572fe9d1988fb46 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -880,6 +880,7 @@ set +x
         multiple_card_tests=''    # cases list which would take multiple GPUs, most cases would be two GPUs
         is_exclusive=''           # indicate whether the case is exclusive type
         is_multicard=''           # indicate whether the case is multiple GPUs type
+        is_nightly=''             # indicate whether the case will only run at night
         while read -r line; do
             if [[ "$line" == "" ]]; then
                 continue
@@ -889,12 +890,19 @@ set +x
                     # Any test case with LABELS property would be parse here
                     # RUN_TYPE=EXCLUSIVE mean the case would run exclusively
                     # RUN_TYPE=DIST mean the case would take two graph GPUs during runtime
+                    # RUN_TYPE=NIGHTLY or RUN_TYPE=DIST:NIGHTLY or RUN_TYPE=EXCLUSIVE:NIGHTLY means the case will ONLY run at night
                     read is_exclusive <<< $(echo "$line"|grep -oEi "RUN_TYPE=EXCLUSIVE")
                     read is_multicard <<< $(echo "$line"|grep -oEi "RUN_TYPE=DIST")
+                    read is_nightly <<< $(echo "$line"|grep -oEi "RUN_TYPE=NIGHTLY|RUN_TYPE=DIST:NIGHTLY|RUN_TYPE=EXCLUSIVE:NIGHTLY")
                     continue
                 fi
                 read testcase <<< $(echo "$line"|grep -oEi "\w+$")
 
+                if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then
+                    echo $testcase" will only run at night."
+                    continue
+                fi
+
                 if [[ "$is_multicard" == "" ]]; then
                   # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs
                   read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist")
@@ -930,6 +938,7 @@ set +x
                 fi
                 is_exclusive=''
                 is_multicard=''
+                is_nightly=''
                 matchstr=''
                 testcase=''
         done <<< "$test_cases";
@@ -1390,6 +1399,9 @@ function main() {
     local CMD=$1 
     local parallel_number=$2
     init
+    if [ "$CMD" != "assert_file_approvals" ];then
+      python ${PADDLE_ROOT}/tools/summary_env.py
+    fi
     case $CMD in
       build_only)
         cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
old mode 100644
new mode 100755
index 4e1e04043ad7d2fd72bfe891b755a2503c2096b3..46b84697e5a61e164cbc826d5018db7a6d87f69f
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -225,14 +225,11 @@ from .framework import CPUPlace  #DEFINE_ALIAS
 from .framework import CUDAPlace  #DEFINE_ALIAS
 from .framework import CUDAPinnedPlace  #DEFINE_ALIAS
 
-from .framework import BackwardStrategy  #DEFINE_ALIAS
 from .framework import to_variable  #DEFINE_ALIAS
 from .framework import grad  #DEFINE_ALIAS
 from .framework import no_grad  #DEFINE_ALIAS
 from .framework import save  #DEFINE_ALIAS
 from .framework import load  #DEFINE_ALIAS
-from .framework import prepare_context  #DEFINE_ALIAS
-from .framework import ParallelEnv  #DEFINE_ALIAS
 from .framework import DataParallel  #DEFINE_ALIAS
 
 from .framework import NoamDecay  #DEFINE_ALIAS
diff --git a/python/paddle/dataset/tests/test_sentiment.py b/python/paddle/dataset/tests/test_sentiment.py
index bb9830132e987370022df3192060de3e908a2e85..3540ea06b075ed9b649af803c5a655a1e737723b 100644
--- a/python/paddle/dataset/tests/test_sentiment.py
+++ b/python/paddle/dataset/tests/test_sentiment.py
@@ -42,9 +42,11 @@ class TestSentimentMethods(unittest.TestCase):
     def test_data_set(self):
         data_set = st.load_sentiment_data()
         last_label = -1
+
         for each in st.test():
             self.assertNotEqual(each[1], last_label)
             last_label = each[1]
+
         self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
         self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
         self.assertEqual(
diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
index 34dd605f901b4357682dc514d59d110db74f9d5b..b7357eef7ad9a3abae7f9c1c09fdc00b409061ad 100644
--- a/python/paddle/distributed/__init__.py
+++ b/python/paddle/distributed/__init__.py
@@ -12,4 +12,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from . import spawn
+from .spawn import spawn
+
+from . import parallel
+from .parallel import init_parallel_env
+from .parallel import get_rank
+from .parallel import get_world_size
+from paddle.fluid.dygraph.parallel import prepare_context  #DEFINE_ALIAS
+from paddle.fluid.dygraph.parallel import ParallelEnv  #DEFINE_ALIAS
+
+from . import collective
 from .collective import *
+
+# start multiprocess apis
+__all__ = ["spawn"]
+
+# dygraph parallel apis
+__all__ += [
+    "init_parallel_env",
+    "get_rank",
+    "get_world_size",
+    "prepare_context",
+    "ParallelEnv",
+]
+
+# collective apis
+__all__ += collective.__all__
diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py
index b080fb17553d4a93a545f4ae781d786d82e26576..42ac68ba1a64de54f029878ceab08435c924d087 100644
--- a/python/paddle/distributed/fleet/__init__.py
+++ b/python/paddle/distributed/fleet/__init__.py
@@ -18,16 +18,15 @@ from .base.distributed_strategy import DistributedStrategy
 from .base.fleet_base import Fleet
 from .base.util_factory import UtilBase
 from .dataset import *
+#from . import metrics
 
 __all__ = [
     "DistributedStrategy",
     "UtilBase",
     "DatasetFactory",
-    "DatasetBase",
-    "InMemoryDataset",
-    "QueueDataset",
     "UserDefinedRoleMaker",
     "PaddleCloudRoleMaker",
+    "Fleet",
 ]
 
 fleet = Fleet()
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index bc6ce8c5e1c3f750318eb105729b88617af5d578..26063d1b8a9225aff63628bb37f433ec95257dc7 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -17,6 +17,8 @@ from paddle.distributed.fleet.proto import distributed_strategy_pb2
 from paddle.fluid.framework import Variable, set_flags, core
 import google.protobuf.text_format
 
+__all__ = ["DistributedStrategy"]
+
 
 def get_msg_dict(msg):
     res_dict = {}
@@ -623,6 +625,20 @@ class DistributedStrategy(object):
 
     @property
     def localsgd(self):
+        """
+        Indicating whether we are using Local SGD training. For more details, please refer to
+        [Don't Use Large Mini-Batches, Use Local SGD](https://arxiv.org/pdf/1808.07217.pdf).
+
+        Default Value: False
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.localsgd = True # by default this is false
+
+        """
         return self.strategy.localsgd
 
     @localsgd.setter
@@ -634,6 +650,28 @@ class DistributedStrategy(object):
 
     @property
     def localsgd_configs(self):
+        """
+        Set LocalSGD training configurations. LocalSGD has a configurable
+        setting that can be configured through a dict.
+
+        **Notes**:
+            **k_steps(int)**: The local steps for training before parameter
+                synchronization. Default 1. If strategy.auto is set True, the
+                local steps will be calculated automatically during training.
+                The algorithm is referenced in this paper: 
+                [Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD](https://arxiv.org/pdf/1810.08313.pdf).
+                In this case, k_steps is the initial number of local steps,
+                and setting it to 1 is suggested.
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.localsgd = True
+            strategy.localsgd_configs = {"k_steps": 4}
+        """
+
         return get_msg_dict(self.strategy.localsgd_configs)
 
     @localsgd_configs.setter
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index a6286bcca87fad1afddbd8af1e56dad05dab2578..f4a16d0de177f8a63271ef43f6716aa31443f06f 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -22,7 +22,7 @@ from .runtime_factory import RuntimeFactory
 from .util_factory import UtilFactory
 from paddle.fluid.wrapped_decorator import wrap_decorator
 
-__all__ = ['Fleet']
+#__all__ = ['Fleet']
 
 
 def _inited_runtime_handler_(func):
@@ -85,7 +85,7 @@ class Fleet(object):
         This function is responsible for the distributed architecture 
         what you want to run your code behind,such as Transpiler,
         Collective in PaddleCloudRoleMaker or UserDefinedRoleMaker 
-        
+
         """
         if isinstance(role_maker, RoleMakerBase):
             self._role_maker = role_maker
@@ -112,7 +112,7 @@ class Fleet(object):
         Returns:
             bool: True if this is the first node of worker,
                   False if not.
-        
+
         """
         return self._role_maker.is_first_worker()
 
@@ -200,7 +200,8 @@ class Fleet(object):
             bool: True if this is a node of server,
                   False if not.
         """
-        return self._role_maker.is_server()
+        return self._role_maker.is_server(
+        ) or self._role_maker._is_heter_worker()
 
     @property
     def util(self):
@@ -372,10 +373,10 @@ class Fleet(object):
                 can_not_apply_optimizer_list.append(opt)
         # combine recalled meta optimizers to be a valid meta optimizer
         meta_optimizer, graph_optimizer = \
-                self.strategy_compiler.generate_optimizer(
-                    loss, self._role_maker, self.user_defined_optimizer,
-                    self.user_defined_strategy, valid_optimizer_list,
-                    valid_graph_optimizer_list)
+            self.strategy_compiler.generate_optimizer(
+                loss, self._role_maker, self.user_defined_optimizer,
+                self.user_defined_strategy, valid_optimizer_list,
+                valid_graph_optimizer_list)
 
         valid_strategy = self.strategy_compiler._get_valid_strategy(
             self.user_defined_strategy, can_not_apply_optimizer_list)
diff --git a/python/paddle/distributed/fleet/base/meta_optimizer_factory.py b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py
index 459070fcc4dbef3711c33b2932e8f1c88647aab5..f845b3fcd8953c44c8b5b857dac08be1c7269958 100755
--- a/python/paddle/distributed/fleet/base/meta_optimizer_factory.py
+++ b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__all__ = ["MetaOptimizerFactory"]
-
 from ..meta_optimizers import *
 
 meta_optimizer_names = list(
diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index 6aeeb4a2896ea1d20390e463937aa07d3edd0204..25f2d0dd3f45855d9f337c6b7154db9cb5bbae45 100644
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -14,15 +14,17 @@
 """Defination of Role Makers."""
 import os
 import numpy as np
+import warnings
 from multiprocessing import Process, Manager
 import paddle.fluid as fluid
 
-__all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
+#__all__ = ['UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
 
 
 class Role:
     WORKER = 1
     SERVER = 2
+    HETER_WORKER = 3
 
 
 class RoleMakerBase(object):
@@ -40,6 +42,11 @@ class RoleMakerBase(object):
         self._role = None
         self._current_id = -1
 
+        # for heter parameter server mode
+        self._heter_trainer_endpoints = []
+        self._heter_trainer_device = "CPU"
+        self._is_heter_parameter_server_mode = False
+
         self._node_type = None
         self._node_type_comm = None
         self._all_comm = None
@@ -163,12 +170,58 @@ class RoleMakerBase(object):
         """
         print("warning: RoleMakerBase does not have barrier worker.")
 
+    def _is_heter_worker(self):
+        """
+        Return is_heter_worker() of current process
+        """
+        warnings.warn("RoleMakerBase does not have function: _is_heter_worker.")
+        return False
+
+    def _heter_worker_num(self):
+        """
+        Get current total heter-worker number.
+
+        Returns:
+            int: heter_worker number
+        """
+        warnings.warn(
+            "RoleMakerBase does not have function: _heter_worker_num.")
+        return 0
+
+    def _get_heter_worker_endpoints(self):
+        """
+        Returns:
+            list: endpoints of all heter trainers
+        """
+        assert self._heter_trainer_endpoints != []
+        return self._heter_trainer_endpoints
+
+    def _get_heter_worker_endpoint(self):
+        """
+        Returns:
+            string: corresponding heter_trainer's endpoint
+
+        e.g: if we have 4 cpu-trainer(default), 2 gpu-trainer(heter)
+             then No.0 and No.2 cpu-trainer will work with No.0 gpu-trainer
+             and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainer
+        """
+        assert self._heter_trainer_endpoints != []
+        return self._heter_trainer_endpoints[(self._current_id + 1) %
+                                             self._heter_worker_num()]
+
+    def _get_heter_worker_device(self):
+        """
+        Returns:
+            string: heter_trainer's device of current node, e.g: CPU/GPU/XPU
+        """
+        return self._heter_trainer_device.upper()
+
 
 class PaddleCloudRoleMaker(RoleMakerBase):
     def __init__(self, is_collective=False, **kwargs):
         super(PaddleCloudRoleMaker, self).__init__()
         self._is_collective = is_collective
-        self._init_gloo = False  #default no init gloo
+        self._init_gloo = False  # default no init gloo
         self._kwargs = kwargs
 
         self._role_is_generated = False
@@ -278,10 +331,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         """
         get index of current node
         """
-        if self.is_server():
-            return self.server_index()
-        elif self.is_worker():
-            return self.worker_index()
+        return self._current_id
 
     def worker_num(self):
         """
@@ -323,6 +373,22 @@ class PaddleCloudRoleMaker(RoleMakerBase):
             self.generate_role()
         return self._server_endpoints
 
+    def _heter_worker_num(self):
+        """
+        get heter worker nums
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._heter_trainers_num
+
+    def _is_heter_worker(self):
+        """
+        whether current process is heter worker
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._role == Role.HETER_WORKER
+
     def _get_rank(self):
         """
         get current rank in all workers and pservers
@@ -342,17 +408,47 @@ class PaddleCloudRoleMaker(RoleMakerBase):
     def _ps_env(self):
         try:
             # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
-            # format: string(ip:port), eg. 127.0.0.1:6001
-            self._server_endpoints = os.environ[
-                "PADDLE_PSERVERS_IP_PORT_LIST"].split(",")
+            # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002
+            self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST",
+                                               "").split(",")
+            assert self._server_endpoints != ""
             self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
                                                "").split(",")
+            assert self._server_endpoints != ""
 
             trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
             training_role = os.environ["TRAINING_ROLE"]
 
-            if training_role not in ["TRAINER", "PSERVER"]:
-                raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
+            if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]:
+                raise ValueError(
+                    "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment.".
+                    format(training_role))
+
+            # For heter parameter server env setting
+            heter_trainer_eplist = os.getenv(
+                "PADDLE_HETER_TRAINER_IP_PORT_LIST", None)
+            heter_trainer_device = os.getenv("PADDLE_HETER_TRAINER_DEVICE",
+                                             None)
+            if heter_trainer_eplist and heter_trainer_device:
+                try:
+                    heter_trainer_eplist = os.environ[
+                        "PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",")
+                except:
+                    raise ValueError(
+                        "Can not Find PADDLE_HETER_TRAINER_IP_PORT_LIST in env or its format doesn't match the requirement: 'IP:PORT,IP:PORT' ."
+                    )
+
+                self._is_heter_parameter_server_mode = True
+                heter_trainers_num = len(heter_trainer_eplist)
+                current_node_device = heter_trainer_device.upper()
+                if current_node_device not in ["CPU", "GPU", "XPU"]:
+                    raise ValueError(
+                        "Heter Trainer doesn't support {} device now, please use CPU / GPU / XPU(KunLun)".
+                        format(heter_trainer_device))
+                self._heter_trainer_device = current_node_device
+            else:
+                self._is_heter_parameter_server_mode = False
+                heter_trainers_num = 0
 
             if training_role == "TRAINER":
                 role = Role.WORKER
@@ -365,17 +461,26 @@ class PaddleCloudRoleMaker(RoleMakerBase):
                 ip = os.environ["POD_IP"]
                 self._cur_endpoint = ip + ":" + port
                 current_id = self._server_endpoints.index(self._cur_endpoint)
+            elif training_role == "HETER_TRAINER":
+                role = Role.HETER_WORKER
+                cur_ip = os.environ["POD_IP"]
+                cur_port = os.environ["PADDLE_PORT"]
+                curr_endpoint = ":".join([cur_ip, cur_port])
+                current_id = heter_trainer_eplist.index(curr_endpoint)
             else:
-                raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
-        except ValueError as ve:
+                raise ValueError(
+                    "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER")
+        except ValueError as e:
             raise ValueError(
-                "something wrong with PaddleCloud, please check environment")
+                "Something wrong with PaddleCloud, please check environment")
 
         self._trainers_num = trainers_num
         self._role = role
         self._current_id = current_id
         self._node_num = len(
             set([x.split(':')[0] for x in self._worker_endpoints]))
+        self._heter_trainers_num = heter_trainers_num
+        self._heter_trainer_endpoints = heter_trainer_eplist
 
     def _collective_env(self):
         self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py
index 075e8b6c4302d792606849fc2981e46ccead1e56..d98b2ef3e2a083861647b2847bafad3b08c86cfd 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py
@@ -15,24 +15,10 @@ from .amp_optimizer import AMPOptimizer
 from .recompute_optimizer import RecomputeOptimizer
 from .gradient_merge_optimizer import GradientMergeOptimizer
 from .graph_execution_optimizer import GraphExecutionOptimizer
-from .async_optimizer import AsyncMetaOptimizer
+from .parameter_server_optimizer import ParameterServerOptimizer
 from .pipeline_optimizer import PipelineOptimizer
 from .localsgd_optimizer import LocalSGDOptimizer
 from .lars_optimizer import LarsOptimizer
-from .async_graph_execution_optimizer import AsyncGraphExecutionOptimizer
+from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer
 from .dgc_optimizer import DGCOptimizer
 from .lamb_optimizer import LambOptimizer
-
-__all__ = [
-    'AMPOptimizer',
-    'RecomputeOptimizer',
-    'GradientMergeOptimizer',
-    'AsyncMetaOptimizer',
-    'GraphExecutionOptimizer',
-    'PipelineOptimizer',
-    'LocalSGDOptimizer',
-    'LarsOptimizer',
-    'AsyncGraphExecutionOptimizer',
-    'DGCOptimizer',
-    'LambOptimizer',
-]
diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
index 66db14209b4c57475c30c6dde083593e27f04ea0..b1952276e44cd1466bc443440505462924115ab7 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -14,8 +14,6 @@
 import paddle.fluid.contrib.mixed_precision as mixed_precision
 from .meta_optimizer_base import MetaOptimizerBase
 
-__all__ = ["AMPOptimizer"]
-
 
 class AMPOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
index f34786f9dc309dd1f03319368bbc93ef1bfc03e3..f1c6defc5c982c7d56980642898aaa333c199bbe 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
@@ -15,8 +15,6 @@ from paddle.fluid.optimizer import Momentum, DGCMomentumOptimizer
 from .meta_optimizer_base import MetaOptimizerBase
 import logging
 
-__all__ = ["DGCOptimizer"]
-
 
 class DGCOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
index bd52179a35862241768ad5bd01eedf16732ad3b6..7db79ad7b5b7081172209faa2396d9f2a31bbdb3 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
@@ -14,10 +14,6 @@
 from paddle.fluid.optimizer import GradientMergeOptimizer as GM
 from .meta_optimizer_base import MetaOptimizerBase
 
-__all__ = ["GradientMergeOptimizer"]
-
-# amp + gradient merge + lamb
-
 
 class GradientMergeOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
index 7e08a02eb1dc2e14b1871fe7743bbee8ade3feb3..9fa29c4078e9f579a740ef8c0591979e7fbb962d 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
@@ -16,8 +16,6 @@ from paddle.fluid.optimizer import LambOptimizer as LAMB
 from .meta_optimizer_base import MetaOptimizerBase
 import logging
 
-__all__ = ["LambOptimizer"]
-
 
 class LambOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
index 09c418fa79106d05cffae1e8bc18fac9c0cc8f34..a7b856ff5b0dcb1ab30de82a12c91a2e1c14fe76 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
@@ -15,8 +15,6 @@ from paddle.fluid.optimizer import Momentum, LarsMomentumOptimizer
 from .meta_optimizer_base import MetaOptimizerBase
 import logging
 
-__all__ = ["LarsOptimizer"]
-
 
 class LarsOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
diff --git a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
index 12a4d904340337bf9a99968c7d82db117bf59ce8..073148e11a0a2b08253b89d36d7a014b830518f8 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__all__ = ["MetaOptimizerBase"]
-
 from paddle.fluid.optimizer import Optimizer
 
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
similarity index 88%
rename from python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
index c0dee220aafd07bf69a198c6b03e6c957c50d4ce..878ed7422d733d3e2828e0395ec63ed16b4c489a 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
@@ -13,12 +13,12 @@
 
 from paddle import fluid
 from paddle.fluid import compiler
-from .async_optimizer import AsyncMetaOptimizer
+from .parameter_server_optimizer import ParameterServerOptimizer
 
 
-class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer):
+class ParameterServerGraphOptimizer(ParameterServerOptimizer):
     def __init__(self, optimizer):
-        super(AsyncGraphExecutionOptimizer, self).__init__(optimizer)
+        super(ParameterServerGraphOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
         # we do not allow meta optimizer to be inner optimizer currently
         self.meta_optimizers_white_list = []
@@ -31,6 +31,9 @@ class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer):
         if self.role_maker.is_server():
             return False
 
+        if self.role_maker._is_heter_parameter_server_mode:
+            return False
+
         return True
 
     def _disable_strategy(self, dist_strategy):
diff --git a/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
similarity index 82%
rename from python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
index b65435497284d279ebdea026e7ac88883a724c7c..ecb198bedf9041aa3ffc929a72cce3c209f03b61 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
@@ -15,9 +15,9 @@ from paddle import fluid
 from .meta_optimizer_base import MetaOptimizerBase
 
 
-class AsyncMetaOptimizer(MetaOptimizerBase):
+class ParameterServerOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
-        super(AsyncMetaOptimizer, self).__init__(optimizer)
+        super(ParameterServerOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
         # we do not allow meta optimizer to be inner optimizer currently
         self.meta_optimizers_white_list = []
@@ -68,6 +68,21 @@ class AsyncMetaOptimizer(MetaOptimizerBase):
             _startup = worker.init_from_server_pass(_startup, compiled_config)
             _startup = worker.delet_extra_optimizes_pass(_startup,
                                                          compiled_config)
+
+            # for heter program
+            if self.role_maker._is_heter_parameter_server_mode:
+                from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker
+                if self.role_maker._is_heter_worker():
+                    # for heter worker
+                    _main = heter_worker.split_heter_worker_ops_pass(
+                        _main, compiled_config)
+                else:
+                    # for default worker
+                    _main = heter_worker.split_trainer_ops_pass(_main,
+                                                                compiled_config)
+                # for startup change
+                _startup = heter_worker.delete_startup_useless_ops_var_pass(
+                    _startup, _main, compiled_config)
         else:
             _main = worker.append_send_ops_pass(_main, compiled_config)
             _startup = _startup
@@ -129,9 +144,12 @@ class AsyncMetaOptimizer(MetaOptimizerBase):
                                                      _origin_startup_program,
                                                      strategy, self.role_maker)
 
-        main_program, startup_program = \
-            self._build_trainer_programs(compiled_config) if self.role_maker.is_worker() \
-                else self._build_pserver_programs(compiled_config)
+        if self.role_maker.is_worker() or self.role_maker._is_heter_worker():
+            main_program, startup_program = self._build_trainer_programs(
+                compiled_config)
+        elif self.role_maker.is_server():
+            main_program, startup_program = self._build_pserver_programs(
+                compiled_config)
 
         loss.block.program = main_program
         fluid.framework.switch_startup_program(startup_program)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
index fe9221307cbacfa1beaf030b70a4e4b9223769cc..d5a45e2b4e1aeda2e1c66c0a5a36236622f093ec 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -20,8 +20,6 @@ from paddle.fluid.optimizer import PipelineOptimizer as PO
 from .meta_optimizer_base import MetaOptimizerBase
 from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op
 
-__all__ = ["PipelineOptimizer"]
-
 
 class PipelineHelper(CollectiveHelper):
     def __init__(self, role_maker, nrings=1, wait_port='6174'):
diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
index 45130b447125f6ecbade2e4e5e3dad2f127fda52..3eb3ca6127cfe0d0a7a458c6c44e09ce22e7b24a 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
@@ -14,8 +14,6 @@
 from paddle.fluid.optimizer import RecomputeOptimizer as RO
 from .meta_optimizer_base import MetaOptimizerBase
 
-__all__ = ["RecomputeOptimizer"]
-
 
 class RecomputeOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
diff --git a/python/paddle/distributed/fleet/metrics/__init__.py b/python/paddle/distributed/fleet/metrics/__init__.py
index abf198b97e6e818e1fbe59006f98492640bcee54..bc30c063787d28e5bcb4455b3cbd56372879fe0a 100644
--- a/python/paddle/distributed/fleet/metrics/__init__.py
+++ b/python/paddle/distributed/fleet/metrics/__init__.py
@@ -11,3 +11,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from .metric import *
+
+__all__ = [
+    "sum",
+    "max",
+    "min",
+    "auc",
+    "mae",
+    "rmse",
+    "mse",
+    "acc",
+]
diff --git a/python/paddle/distributed/fleet/runtime/__init__.py b/python/paddle/distributed/fleet/runtime/__init__.py
index a796a73fc981b7edbcd57e8f5858456031e7ae6e..cf718b199e52e422ff8f2b66317f3cd6123c76a1 100644
--- a/python/paddle/distributed/fleet/runtime/__init__.py
+++ b/python/paddle/distributed/fleet/runtime/__init__.py
@@ -14,5 +14,3 @@
 
 from .collective_runtime import CollectiveRuntime
 from .parameter_server_runtime import ParameterServerRuntime
-
-__all__ = ["CollectiveRuntime," "ParameterServerRuntime", ]
diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
index c731ed08893348d0be604eb383905cd4a9d6e228..1741f10ccb1c28bfe6abaa63e754568fa08e21ce 100644
--- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
@@ -196,6 +196,18 @@ class ParameterServerRuntime(RuntimeBase):
         else:
             warnings.warn("communicator has been initialized, skip")
 
+    def _get_executor(self):
+        """Return a GPU executor for heter workers, a CPU one otherwise."""
+        if not self.role_maker._is_heter_worker():
+            return fluid.Executor(fluid.CPUPlace())
+        device = self.role_maker._get_heter_worker_device()
+        if device != "GPU":
+            raise ValueError("Not Support Device {}".format(device))
+        # the card index comes from FLAGS_selected_gpus (card 0 when unset)
+        gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        # use `fluid.Executor`, the spelling every other call site here uses
+        return fluid.Executor(fluid.CUDAPlace(gpu_id))
+
     def _init_server(self, *args, **kwargs):
         if len(args) > 1:
             raise ValueError("init server can only accept 1 args: `dirname`")
@@ -204,9 +216,15 @@ class ParameterServerRuntime(RuntimeBase):
         else:
             model_dirname = None
 
-        executor = fluid.Executor(fluid.CPUPlace())
+        if self.role_maker._is_heter_worker():
+            self._init_worker()
+
+        executor = self._get_executor()
         executor.run(fluid.default_startup_program())
 
+        if self.role_maker._is_heter_worker():
+            return
+
         if not model_dirname:
             return
 
@@ -237,12 +255,12 @@ class ParameterServerRuntime(RuntimeBase):
         # self._load_sparse_params(dirname=model_dir, varnames=distribtued_varnames)
 
     def _run_server(self):
-        executor = fluid.Executor(fluid.CPUPlace())
+        executor = self._get_executor()
         executor.run(fluid.default_main_program())
 
     def _stop_worker(self):
         self._communicator.stop()
-        executor = fluid.Executor(fluid.CPUPlace())
+        executor = self._get_executor()
         executor.close()
 
     def _get_optimizer_status(self, op, param_name):
diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py
index 212308159aabb123fde11543b3482f2232b4925d..f1911408c84a9dde56a8674e88e0fb8ad575cae7 100644
--- a/python/paddle/distributed/fleet/utils/__init__.py
+++ b/python/paddle/distributed/fleet/utils/__init__.py
@@ -15,4 +15,4 @@
 from .fs import *
 from .http_server import KVHandler, KVHTTPServer, KVServer
 
-__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__
+#__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__
diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py
index ecd1cf0ca7bef6586e4833ce80c48eb08a6ad2ee..e2ab321f9aebddd437c92ded9e6005495f760096 100644
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch.py
@@ -44,11 +44,9 @@ import time
 import six
 import copy
 from argparse import ArgumentParser, REMAINDER
-import paddle
-import paddle.fluid as fluid
 
 from paddle.distributed.utils import *
-import paddle.distributed.cloud_utils as cloud_utils
+from paddle.distributed import cloud_utils
 
 
 def _print_arguments(args):
@@ -167,7 +165,8 @@ def get_cluster_from_args(args, selected_gpus):
 
 def get_gpus(selected_gpus):
     if selected_gpus is None:
-        gpus_num = fluid.core.get_cuda_device_count()
+        from paddle.fluid import core
+        gpus_num = core.get_cuda_device_count()
         selected_gpus = [str(x) for x in range(0, gpus_num)]
     else:
         cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
@@ -190,7 +189,7 @@ def get_gpus(selected_gpus):
     return selected_gpus
 
 
-def launch(args):
+def get_cluster_and_pod(args):
     # parse arguments, used for cloud-single-machine and local
     selected_gpus = get_gpus(args.selected_gpus)
     trainers_num = cloud_utils.get_trainers_num()
@@ -209,6 +208,12 @@ def launch(args):
         cluster, pod = get_cluster_from_args(args, selected_gpus)
         logger.info("get cluster from args:{}".format(cluster))
 
+    return cluster, pod
+
+
+def launch(args):
+    cluster, pod = get_cluster_and_pod(args)
+
     procs = start_local_trainers(
         cluster,
         pod,
diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c806747217add5022b0c6ea66e184b44ef56836
--- /dev/null
+++ b/python/paddle/distributed/parallel.py
@@ -0,0 +1,184 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import six
+import warnings
+
+from paddle import compat as cpt
+
+# deprecated module import
+from paddle.fluid import core
+from paddle.fluid.framework import _set_expected_place
+from paddle.fluid.dygraph import parallel_helper
+from paddle.fluid.dygraph.parallel import ParallelEnv
+
+__all__ = ["init_parallel_env"]
+
+ParallelStrategy = core.ParallelStrategy
+
+
+def init_parallel_env(backend='nccl'):
+    """
+    Initialize the parallel training environment in dynamic mode.
+
+    Args:
+        backend(str, optional): The backend used to communicate between multiple
+            devices. Now only ``nccl`` is supported. Default value is ``nccl`` .
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import paddle.optimizer as opt
+            import paddle.distributed as dist
+
+            class LinearNet(nn.Layer):
+                def __init__(self):
+                    super(LinearNet, self).__init__()
+                    self._linear1 = nn.Linear(10, 10)
+                    self._linear2 = nn.Linear(10, 1)
+
+                def forward(self, x):
+                    return self._linear2(self._linear1(x))
+
+            def train():
+                # 1. enable dynamic mode
+                paddle.disable_static()
+
+                # 2. initialize parallel environment
+                dist.init_parallel_env()
+
+                # 3. create data parallel layer & optimizer
+                layer = LinearNet()
+                dp_layer = paddle.DataParallel(layer)
+
+                loss_fn = nn.MSELoss()
+                adam = opt.Adam(
+                    learning_rate=0.001, parameters=dp_layer.parameters())
+
+                # 4. run layer
+                inputs = paddle.randn([10, 10], 'float32')
+                outputs = dp_layer(inputs)
+                labels = paddle.randn([10, 1], 'float32')
+                loss = loss_fn(outputs, labels)
+
+                loss = dp_layer.scale_loss(loss)
+                loss.backward()
+                dp_layer.apply_collective_grads()
+
+                adam.step()
+                adam.clear_grad()
+
+            if __name__ == '__main__':
+                dist.spawn(train)
+    """
+
+    # 1. input check: TypeError for a non-str backend, ValueError if not `nccl`
+    if not isinstance(backend, six.string_types):
+        raise TypeError("input `backend` type error, expected type is str, "
+                        "but received type is %s." % type(backend))
+    if cpt.to_text(backend) != 'nccl':
+        raise ValueError(
+            "backend `%s` is not supported, now only supports `nccl` backend." %
+            backend)
+
+    # 2. check env: every variable below must be set, else ValueError
+    def _check_var_exists(var_name):
+        var = os.environ.get(var_name, None)
+        if var is None:
+            raise ValueError("paddle.distributed initialize error, "
+                             "environment variable %s is needed, but not set." %
+                             var_name)
+
+    _check_var_exists("FLAGS_selected_gpus")
+    _check_var_exists("PADDLE_TRAINER_ID")
+    _check_var_exists("PADDLE_CURRENT_ENDPOINT")
+    _check_var_exists("PADDLE_TRAINERS_NUM")
+    _check_var_exists("PADDLE_TRAINER_ENDPOINTS")
+
+    # 3. init ParallelStrategy from the current `ParallelEnv` values
+    strategy = ParallelStrategy()
+    if cpt.to_text(backend) == 'nccl':
+        if parallel_helper._is_parallel_ctx_initialized():
+            warnings.warn("The parallel environment has been initialized.")
+        strategy.nranks = ParallelEnv().world_size
+        strategy.local_rank = ParallelEnv().rank
+        strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
+        strategy.current_endpoint = ParallelEnv().current_endpoint
+        if strategy.nranks < 2:
+            return
+        # NOTE(chenweihang): [ why config global place here? ]
+        # dygraph mode will become the default mode, so users will
+        # not call `dygraph.guard` or `enable_dygraph` directly;
+        # switching the default place would then require an extra
+        # function call from them, so the correct place is simply
+        # set here on their behalf
+        place = core.CUDAPlace(ParallelEnv().device_id)
+        _set_expected_place(place)
+
+        # init nccl context
+        parallel_helper._set_parallel_ctx(
+            core.NCCLParallelContext(strategy, place))
+        parallel_helper._init_parallel_ctx()
+
+
+def get_rank():
+    """
+    Return the rank of the current trainer, taken from ``ParallelEnv().rank`` .
+
+    Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` .
+    The default value is 0.
+
+    Returns:
+        (int) The rank of current trainer.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.distributed as dist
+
+            # execute this command in terminal: export PADDLE_TRAINER_ID=0
+            print("The rank is %d" % dist.get_rank())
+            # The rank is 0
+    """
+    return ParallelEnv().rank
+
+
+def get_world_size():
+    """
+    Return the number of trainers (processes participating in the current job).
+
+    Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` .
+    The default value is 1.
+
+    Returns:
+        (int) The number of trainers, taken from ``ParallelEnv().world_size`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.distributed as dist
+
+            # execute this command in terminal: export PADDLE_TRAINERS_NUM=4
+            print("The world_size is %d" % dist.get_world_size())
+            # The world_size is 4
+    """
+    return ParallelEnv().world_size
diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ca2ebaa8d4bd3e0f11e41cdcc35ab585a70b802
--- /dev/null
+++ b/python/paddle/distributed/spawn.py
@@ -0,0 +1,415 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function, division
+
+import multiprocessing
+import os
+import signal
+import six
+import sys
+import warnings
+
+from paddle.distributed.launch import get_cluster_and_pod, _print_arguments
+from paddle.distributed.utils import _prepare_trainer_env
+from paddle.device import get_device
+
+# deprecated module import
+from paddle.fluid import core
+from paddle.fluid.framework import _cpu_num
+
+
+# NOTE(chenweihang): The existence of this class leads to 
+# the maintenance of two arguments. When the launch.py arguments 
+# is updated, the arguments here also need to be updated, 
+# but I have not thought of a better way here
+class ParallelEnvArgs(object):
+    """Argument holder handed to launch.py's `get_cluster_and_pod`."""
+    def __init__(self):
+        # Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..
+        self.cluster_node_ips = None
+
+        # The current node ip.
+        self.node_ip = None
+
+        # Whether to use the paddlecloud platform to run the multi-process
+        # job. If false, no need to set this argument.
+        self.use_paddlecloud = None
+
+        # The trainer's started port on a single node
+        self.started_port = None
+
+        # Print the config or not; overridden from the `print_config` option
+        self.print_config = True
+
+        # Used for gpu training: the training processes run on the
+        # selected_gpus, each process is bound to a single GPU. If it
+        # is not set, all the gpu cards are used for training.
+        self.selected_gpus = None
+
+
+def _py_supported_check():
+    """Raise RuntimeError unless the interpreter is Python 3.4 or newer."""
+    if sys.version_info < (3, 4):
+        raise RuntimeError(
+            "Use `paddle.distributed.spawn` to start parallel training "
+            "requires python version 3.4 or higher, if your python is lower "
+            "than that, please use `paddle.distributed.launch` instead.")
+
+
+def _get_subprocess_env_list(nprocs, options):
+    """Build one environment-variable dict per process that `spawn` starts.
+
+    `options` are the keyword options of `spawn`. Raises ValueError or
+    RuntimeError when the ip/gpu options do not fit together or `nprocs`.
+    """
+    args = ParallelEnvArgs()
+
+    # set default `node_ip` and `cluster_node_ips`
+    args.cluster_node_ips = options.get('cluster_node_ips', None)
+    args.node_ip = options.get('node_ip', None)
+    if args.cluster_node_ips is not None and args.node_ip is None:
+        raise ValueError("please input current node ip, "
+                         "cannot only give `cluster_node_ips`.")
+    default_node_ip = "127.0.0.1"
+    if args.node_ip is None:
+        args.node_ip = default_node_ip
+    if args.cluster_node_ips is None:
+        args.cluster_node_ips = default_node_ip
+
+    # set default selected gpus (absolute gpu card ids),
+    # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3"
+    # NOTE(chenweihang): FLAGS_selected_gpus is not set directly because it
+    # may be used elsewhere; `0,1,2,3` there may cause error in `ParallelEnv`
+    args.selected_gpus = options.get('selected_gpus', None)
+    env_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)
+    if env_devices is None or env_devices == "":
+        env_devices_list = [
+            str(x) for x in six.moves.range(core.get_cuda_device_count())
+        ]
+    else:
+        env_devices_list = env_devices.split(',')
+    if args.selected_gpus is None:
+        if len(env_devices_list) < nprocs:
+            raise RuntimeError(
+                "the number of visible devices(%d) is less than the number "
+                "of spawn processes(%d), please ensure that the correct "
+                "`nprocs` argument is passed or the environment variable "
+                "`CUDA_VISIBLE_DEVICES` is correctly configured." %
+                (len(env_devices_list), nprocs))
+        args.selected_gpus = ",".join(
+            [str(env_devices_list[x]) for x in range(0, nprocs)])
+    else:
+        selected_gpu_list = args.selected_gpus.split(',')
+        # one card per spawned process is expected, as in the default above
+        if len(selected_gpu_list) != nprocs:
+            raise ValueError(
+                "The number of selected gpus(%d) is not equal to the number "
+                "of spawn processes(%d), please ensure that the correct "
+                "`nprocs` and `selected_gpus` arguments are passed." %
+                (len(selected_gpu_list), nprocs))
+        for card_id in selected_gpu_list:
+            if card_id not in env_devices_list:
+                raise ValueError("The selected gpu card %s cannot found in "
+                                 "CUDA_VISIBLE_DEVICES (%s)." %
+                                 (card_id, ",".join(env_devices_list)))
+
+    # set other arguments
+    args.started_port = options.get('started_port', None)
+    args.use_paddlecloud = options.get('use_paddlecloud', False)
+    args.print_config = options.get('print_config', False)
+
+    # reuse code of launch.py, one env dict per trainer of the local pod
+    cluster, pod = get_cluster_and_pod(args)
+    env_list = [_prepare_trainer_env(cluster, t) for t in pod.trainers]
+    if args.print_config:
+        _print_arguments(args)
+    return env_list
+
+
+def _remove_risky_env():
+    # drop the proxy variables, same as launch.py; os.environ is edited
+    # in place (no copy), each process holds its own env vars
+    for proxy_var in ("http_proxy", "https_proxy"):
+        os.environ.pop(proxy_var, None)
+
+
+def _set_trainer_env(env_dict):
+    # export every given variable into this process's environment
+    os.environ.update(env_dict)
+
+
+def _func_wrapper(func, args, error_queue, return_queue, env_dict):
+    # Subprocess entry point: set up the environment, run `func(*args)` and
+    # report either its return value or its traceback through the queues.
+    try:
+        _remove_risky_env()
+        _set_trainer_env(env_dict)
+        return_queue.put(func(*args))
+    except KeyboardInterrupt:
+        # a keyboard interrupt is swallowed on purpose
+        pass
+    except Exception:
+        import traceback
+        # hand the formatted traceback over, then exit with code 1
+        error_queue.put(traceback.format_exc())
+        sys.exit(1)
+
+
+class MultiprocessContext(object):
+    """Tracks spawned processes with their error and return queues."""
+
+    def __init__(self, processes, error_queues, return_queues):
+        _py_supported_check()
+        self.error_queues = error_queues
+        # NOTE(chenweihang): `spawn` mainly wraps a program's outermost
+        # function, whose return value is usually ignored; users who need
+        # it can read each process's result from context.return_queues
+        self.return_queues = return_queues
+        self.processes = processes
+        self.sentinels = {
+            process.sentinel: index
+            for index, process in enumerate(processes)
+        }
+
+    def join(self, timeout=None):
+        """Wait for exits: True when all are done, raises if one failed."""
+        if len(self.sentinels) == 0:
+            return True
+        ready = multiprocessing.connection.wait(
+            self.sentinels.keys(), timeout=timeout)
+
+        error_index = None
+        for sentinel in ready:
+            index = self.sentinels.pop(sentinel)
+            process = self.processes[index]
+            process.join()
+            if process.exitcode != 0:
+                error_index = index
+                break
+
+        if error_index is None:
+            return len(self.sentinels) == 0
+
+        # a process failed: stop the remaining ones, then report the error
+        for process in self.processes:
+            if process.is_alive():
+                process.terminate()
+            process.join()
+        self._throw_exception(error_index)
+
+    def _throw_exception(self, error_index):
+        """Raise an Exception describing why process `error_index` failed."""
+        if self.error_queues[error_index].empty():
+            exitcode = self.processes[error_index].exitcode
+            if exitcode < 0:
+                name = signal.Signals(-exitcode).name
+                raise Exception("Process %d terminated with signal %s." %
+                                (error_index, name))
+            else:
+                raise Exception("Process %d terminated with exit code %d." %
+                                (error_index, exitcode))
+
+        msg = "\n\n----------------------------------------------\n" \
+              "Process %d terminated with the following error:\n" \
+              "----------------------------------------------\n\n" % error_index
+        msg += self.error_queues[error_index].get()
+        raise Exception(msg)
+
+
+def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
+    """
+    Start multiple processes with ``spawn`` method for parallel training.
+
+    Args:
+        func (function): The target function is called by spawned process.
+            This function needs to be picklable, so it must be defined
+            at the top level of a module.
+            Each process calls it as ``func(*args)``; no process index is
+            passed, ``args`` contains all of its arguments as tuple.
+        args (tuple, optional): Arguments passed to ``func``.
+        nprocs (int, optional): Number of processes to start. Default: -1.
+            when nprocs is -1, the available device will be obtained from
+            the environment variable when the model is executed: If use GPU,
+            the currently available device ID is obtained from the environment
+            variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available
+            CPU number is obtained from the environment variable CPU_NUM.
+            For example, export CPU_NUM=4, if the environment variable is not set,
+            the executor will add the variable to the environment variable and
+            set its value to 1.
+        join (bool, optional): Perform a blocking join on all spawned processes.
+            Default: True.
+        daemon (bool, optional): The spawned processes' daemon flag. Default: False.
+        **options(dict, optional): Other initial parallel execution environment
+            configuration options. The following options are currently supported:
+            (1) start_method (string): the way to start a process.
+            The start method can be ``spawn`` , ``fork`` , ``forkserver`` .
+            Because the CUDA runtime does not support the ``fork`` start method,
+            when use CUDA in subprocesses, we should start process by ``spawn``
+            or ``forkserver`` method. Default: "spawn" ;
+            (2) cluster_node_ips (string): Paddle cluster nodes ips, such as
+            "192.168.0.16,192.168.0.17". Default: "127.0.0.1";
+            (3) node_ip (string): The current node ip, such as "192.168.0.16".
+            Default: "127.0.0.1";
+            (4) started_port (int): The trainer's started port on a single node,
+            such as 6170. Default: None;
+            (5) selected_gpus (string): The training process will run on the
+            selected_gpus, such as "0,1,2,3". Default: None;
+            (6) print_config: Print current parallel training config. Default: False;
+            (7) use_paddlecloud: Whether to use paddlecloud platform to run your
+            multi-process job. Default: False.
+
+    Returns:
+        ``MultiprocessContext`` object, it hold the spawned processes.
+
+    Examples:
+        .. code-block:: python
+
+            from __future__ import print_function
+
+            import paddle
+            import paddle.nn as nn
+            import paddle.optimizer as opt
+            import paddle.distributed as dist
+
+            class LinearNet(nn.Layer):
+                def __init__(self):
+                    super(LinearNet, self).__init__()
+                    self._linear1 = nn.Linear(10, 10)
+                    self._linear2 = nn.Linear(10, 1)
+
+                def forward(self, x):
+                    return self._linear2(self._linear1(x))
+
+            def train(print_result=False):
+                # 1. enable dynamic mode
+                paddle.disable_static()
+
+                # 2. initialize parallel environment
+                dist.init_parallel_env()
+
+                # 3. create data parallel layer & optimizer
+                layer = LinearNet()
+                dp_layer = paddle.DataParallel(layer)
+
+                loss_fn = nn.MSELoss()
+                adam = opt.Adam(
+                    learning_rate=0.001, parameters=dp_layer.parameters())
+
+                # 4. run layer
+                inputs = paddle.randn([10, 10], 'float32')
+                outputs = dp_layer(inputs)
+                labels = paddle.randn([10, 1], 'float32')
+                loss = loss_fn(outputs, labels)
+
+                if print_result is True:
+                    print("loss:", loss.numpy())
+
+                loss = dp_layer.scale_loss(loss)
+                loss.backward()
+                dp_layer.apply_collective_grads()
+
+                adam.step()
+                adam.clear_grad()
+
+            # Usage 1: only pass function.
+            # If your training method no need any argument, and
+            # use all visible devices for parallel training.
+            if __name__ == '__main__':
+                dist.spawn(train)
+
+            # Usage 2: pass function and arguments.
+            # If your training method need some arguments, and
+            # use all visible devices for parallel training.
+            if __name__ == '__main__':
+                dist.spawn(train, args=(True,))
+
+            # Usage 3: pass function, arguments and nprocs.
+            # If your training method need some arguments, and
+            # only use part of visible devices for parallel training.
+            # If your machine hold 8 cards {0,1,2,3,4,5,6,7},
+            # this case will use cards {0,1}; If you set
+            # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use
+            # cards {4,5}
+            if __name__ == '__main__':
+                dist.spawn(train, args=(True,), nprocs=2)
+
+            # Usage 4: pass function, arguments, nprocs and selected_gpus.
+            # If your training method need some arguments, and
+            # only use part of visible devices for parallel training,
+            # but you can't set your machine's environment variable
+            # CUDA_VISIBLE_DEVICES, such as it is None or all cards
+            # {0,1,2,3,4,5,6,7}, you can pass `selected_gpus` to
+            # select the GPU cards you want to use. For example,
+            # this case will use cards {4,5} if your machine hold 8 cards.
+            if __name__ == '__main__':
+                dist.spawn(train, args=(True,), nprocs=2, selected_gpus='4,5')
+    """
+    # NOTE(chenweihang): [ why only supports python3.4+ ? ]
+    # Python supported setting the child process startup method
+    # since 3.4. The previous version can only use the default startup
+    # method, while the default startup method of Unix is fork, which
+    # cannot support CUDA runtime multi-process
+    _py_supported_check()
+
+    # get default nprocs
+    if nprocs == -1:
+        device = get_device()
+        if device == 'cpu':
+            # TODO: not supports cpu parallel now
+            nprocs = _cpu_num()
+        else:
+            nprocs = core.get_cuda_device_count()
+
+    # NOTE(chenweihang): [ why need get cluster info before run? ]
+    # when using `paddle.distributed.spawn` start parallel training,
+    # we should get cluster info before starting subprocess, and pass
+    # correct info to each subprocess
+    procs_env_list = _get_subprocess_env_list(nprocs, options)
+
+    # start processes
+    # NOTE(chenweihang): [ why default start method is spawn? ]
+    # The CUDA runtime does not support the fork start method,
+    # either the spawn or forkserver start method are required
+    # to use CUDA in subprocesses.
+    start_method = options.get('start_method', None)
+    if start_method is None:
+        start_method = 'spawn'
+    mp = multiprocessing.get_context(start_method)
+
+    error_queues = []
+    return_queues = []
+    processes = []
+    for i in range(nprocs):
+        error_queue = mp.SimpleQueue()
+        return_queue = mp.SimpleQueue()
+        process = mp.Process(
+            target=_func_wrapper,
+            args=(func, args, error_queue, return_queue, procs_env_list[i]))
+        process.daemon = daemon
+        process.start()
+        error_queues.append(error_queue)
+        return_queues.append(return_queue)
+        processes.append(process)
+
+    context = MultiprocessContext(processes, error_queues, return_queues)
+    if not join:
+        return context
+
+    # loop until all process end
+    while not context.join():
+        pass
+
+    # finally return context
+    return context
diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py
index 7c8fa257f778e71cab35054c3f9d63faaa33de47..1fa307c4d1b89d4033a8f8346b254177053e9dc0 100644
--- a/python/paddle/distributed/utils.py
+++ b/python/paddle/distributed/utils.py
@@ -327,6 +327,17 @@ def find_free_ports(num):
     return None
 
 
+def _prepare_trainer_env(cluster, trainer):
+    """Build the environment variables passed to one trainer subprocess."""
+    return {
+        "FLAGS_selected_gpus": ",".join(str(gpu) for gpu in trainer.gpus),
+        "PADDLE_TRAINER_ID": "%d" % trainer.rank,
+        "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
+        "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
+        "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
+    }
+
+
 class TrainerProc(object):
     def __init__(self):
         self.proc = None
@@ -352,14 +363,7 @@ def start_local_trainers(cluster,
 
     procs = []
     for idx, t in enumerate(pod.trainers):
-        proc_env = {
-            "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]),
-            "PADDLE_TRAINER_ID": "%d" % t.rank,
-            "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
-            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
-            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
-        }
-
+        proc_env = _prepare_trainer_env(cluster, t)
         current_env.update(proc_env)
 
         logger.debug("trainer proc env:{}".format(current_env))
diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py
index acb7251a15dbfff7e032079c2be2e973e81aa5f4..49e98805d24f3f8f5dc1cfcbf3ddc8d9fb835fde 100644
--- a/python/paddle/distribution.py
+++ b/python/paddle/distribution.py
@@ -243,10 +243,19 @@ class Uniform(Distribution):
             zero_tmp = tensor.fill_constant_batch_size_like(
                 self.low + self.high, batch_shape + shape, self.low.dtype, 0.)
             uniform_random_tmp = nn.uniform_random_batch_size_like(
-                zero_tmp, zero_tmp.shape, min=0., max=1., seed=seed)
-            output = uniform_random_tmp * (zero_tmp + self.high - self.low
-                                           ) + self.low
-            return nn.reshape(output, output_shape, name=name)
+                zero_tmp,
+                zero_tmp.shape,
+                dtype=convert_dtype(zero_tmp.dtype),
+                min=0.,
+                max=1.,
+                seed=seed)
+            zero_tmp_reshape = nn.reshape(zero_tmp, output_shape)
+            uniform_random_tmp_reshape = nn.reshape(uniform_random_tmp,
+                                                    output_shape)
+            output = uniform_random_tmp_reshape * (
+                zero_tmp_reshape + self.high - self.low)
+            output = elementwise_add(output, self.low, name=name)
+            return output
         else:
             output_shape = shape + batch_shape
             output = nn.uniform_random(
@@ -446,11 +455,17 @@ class Normal(Distribution):
             output_shape = shape + batch_shape
             zero_tmp = tensor.fill_constant_batch_size_like(
                 self.loc + self.scale, batch_shape + shape, self.loc.dtype, 0.)
-            zero_tmp_shape = nn.shape(zero_tmp)
+            zero_tmp_reshape = nn.reshape(zero_tmp, output_shape)
+            zero_tmp_shape = nn.shape(zero_tmp_reshape)
             normal_random_tmp = nn.gaussian_random(
-                zero_tmp_shape, mean=0., std=1., seed=seed)
-            output = normal_random_tmp * (zero_tmp + self.scale) + self.loc
-            return nn.reshape(output, output_shape, name=name)
+                zero_tmp_shape,
+                mean=0.,
+                std=1.,
+                seed=seed,
+                dtype=convert_dtype(self.loc.dtype))
+            output = normal_random_tmp * (zero_tmp_reshape + self.scale)
+            output = elementwise_add(output, self.loc, name=name)
+            return output
         else:
             output_shape = shape + batch_shape
             output = nn.gaussian_random(output_shape, mean=0., std=1., seed=seed) * \
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 2ed8642c86d95bf049920d281f4063da9779623e..9f748b7956f9faa6b1c948d87f0ef4659057a421 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -196,6 +196,7 @@ def __bootstrap__():
         'free_idle_chunk',
         'free_when_no_cache_hit',
         'call_stack_level',
+        'sort_sum_gradient',
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index cece2ba4a3d788ab2df4c0a6a847c9597d36047a..e3755cbafea41e61352f67c3de040e700297b61a 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -15,6 +15,7 @@
 import logging
 import numpy as np
 import sys
+import paddle
 from paddle.fluid import dygraph
 from paddle.fluid.dygraph.nn import Conv2D
 from paddle.fluid.dygraph.nn import Linear
@@ -195,13 +196,16 @@ class ImperativeQuantAware(object):
         with dygraph.guard():
             model.eval()
             input_vars = []
-            for shape, dtype in zip(input_shape, input_dtype):
-                raw_data = np.random.random(shape)
-                input_data = raw_data[np.newaxis, :].astype(
-                    dtype) if append_batch_size else raw_data.astype(dtype)
-                input_var = dygraph.to_variable(input_data)
-                input_vars.append(input_var)
-            outputs = prog_trans.get_output(model.forward, model, *input_vars)
+            for i, (shape, dtype) in enumerate(zip(input_shape, input_dtype)):
+                if append_batch_size:
+                    shape = [None] + list(shape)
+                # Note(Aurelius84): need an elegant way to name this.
+                in_spec = paddle.static.InputSpec(shape, dtype, 'feed_%d' % i)
+                input_vars.append(in_spec)
+            # use `declarative` to convert dygraph into static program
+            model.forward = dygraph.jit.declarative(
+                model.forward, input_spec=input_vars)
+            outputs = model.forward.concrete_program.outputs
         input_spec = [input_vars[i] for i in feed]
         configs = dygraph.jit.SaveLoadConfig()
         configs.separate_params = True
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 14d1114a8f64a1238596fb1050e2cdb6e31ec6b0..b5a8d901943318ca039b0a73c1be39fb0734e212 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -68,6 +68,7 @@ _out_scale_op_list = [
     "scale",
     "hard_swish",
     "hard_sigmoid",
+    "conv2d_transpose",
 ]
 
 # list op real input and output names, to avoid processing input such as AxisTensor.
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
index f0943b72add6a7a37b5da45853bdddc8c5224a3e..007d701284dfc7ff2cafb128984414517579fce3 100644
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -271,6 +271,6 @@ endforeach()
 
 # setting timeout value for old unittests
 if(NOT WIN32)
-	set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 250)
-	set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200)
+    set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 250 LABELS "RUN_TYPE=NIGHTLY")
+    set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200 LABELS "RUN_TYPE=NIGHTLY")
 endif()
diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
index 2b331308de5ee9a8aa52a9e303bfbcf8d4264d5f..a5f08ca969ac43f47899395aeb588ddaf2f1e394 100644
--- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
+++ b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
@@ -92,9 +92,11 @@ class TestWeightDecay(unittest.TestCase):
         return param_sum
 
     def check_weight_decay(self, place, model):
+        paddle.manual_seed(1)
+        paddle.framework.random._manual_program_seed(1)
         main_prog = fluid.framework.Program()
         startup_prog = fluid.framework.Program()
-        startup_prog.random_seed = 1
+
         with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
             data = fluid.layers.data(
                 name="words", shape=[1], dtype="int64", lod_level=1)
@@ -113,9 +115,11 @@ class TestWeightDecay(unittest.TestCase):
         return param_sum
 
     def check_weight_decay2(self, place, model):
+        paddle.manual_seed(1)
+        paddle.framework.random._manual_program_seed(1)
         main_prog = fluid.framework.Program()
         startup_prog = fluid.framework.Program()
-        startup_prog.random_seed = 1
+
         with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
             data = fluid.layers.data(
                 name="words", shape=[1], dtype="int64", lod_level=1)
diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py
index a81d73d7e9a621d2a02ed91541f32b827bdff38c..6a996493e4df1e1facc6ccd205a8ae5105f92c5b 100644
--- a/python/paddle/fluid/dataloader/dataloader_iter.py
+++ b/python/paddle/fluid/dataloader/dataloader_iter.py
@@ -30,7 +30,8 @@ if six.PY2:
 else:
     import queue
 
-from .. import core
+import paddle
+from .. import core, layers
 from ..framework import in_dygraph_mode
 from ..multiprocess_utils import CleanupFuncRegistrar, _cleanup_mmap, _set_SIGCHLD_handler
 from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher
@@ -79,7 +80,13 @@ def default_collate_fn(batch):
                 slots.append([item])
             else:
                 slots[i].append(item)
-    return [np.stack(slot, axis=0) for slot in slots]
+
+    if isinstance(slots[0][0], np.ndarray):
+        return [np.stack(slot, axis=0) for slot in slots]
+    elif isinstance(slots[0][0], paddle.Tensor):
+        return [layers.stack(slot, axis=0) for slot in slots]
+    else:
+        raise RuntimeError("Unknown data type {}".format(type(slots[0][0])))
 
 
 class _DatasetKind(object):
@@ -284,6 +291,12 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
                 for slot in batch:
                     if not isinstance(slot, core.LoDTensor):
                         self._check_input_array(slot)
+                        # FIXME(dkp): blocking_queue only support
+                        #             core.LoDTensorArray as input now, read
+                        #             numpy data into a LoDTensorArray here,
+                        #             should support paddle.Tensor list later
+                        if isinstance(slot, paddle.Tensor):
+                            slot = slot.numpy()
                         tmp = core.LoDTensor()
                         tmp.set(slot, core.CPUPlace())
                         slot = tmp
@@ -305,6 +318,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
 
     @classmethod
     def _check_input_array(cls, item):
+        if isinstance(item, paddle.Tensor):
+            return
         arr = np.array(item)
         if arr.dtype == np.object:
             raise TypeError((
@@ -530,6 +545,14 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
                         out_queue.put((idx, e))
                 else:
                     if self._use_shared_memory:
+                        # FIXME(dkp): _convert_to_tensor_list only support np.array
+                        #             list now, should support paddle.Tensor list
+                        if isinstance(batch[0][0], paddle.Tensor):
+                            np_batch = []
+                            for sample in batch:
+                                np_batch.append([s.numpy() for s in sample])
+                            batch = np_batch
+
                         tensor_list = core._convert_to_tensor_list(batch)
                         out_queue.put((idx, tensor_list))
                         core._remove_tensor_list_mmap_fds(tensor_list)
@@ -585,22 +608,24 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
             # in _send_idx but will not increase _rcvd_idx, so we check 
             # whether the worker is still alive here to skip the discarded
             # batch indices and increase _rcvd_idx
-            while self._rcvd_idx < self._send_idx:
-                info = self._task_infos[self._rcvd_idx]
-                if len(info) == 2 or self._worker_status[info[0]]:
-                    break
-                del self._task_infos[self._rcvd_idx]
-                self._rcvd_idx += 1
-                self._batches_outstanding -= 1
-            else:
-                # NOTE: _rcvd_idx and _send_idx only record batches among
-                #       workers, if batches among workers drained, there
-                #       may also be data in blocking queue
-                if self._batches_outstanding < len(self._places):
-                    return None
-                continue
-
-            if len(self._task_infos[self._rcvd_idx]) == 2:
+            if self._dataset_kind == _DatasetKind.ITER:
+                while self._rcvd_idx < self._send_idx:
+                    info = self._task_infos[self._rcvd_idx]
+                    if len(info) == 2 or self._worker_status[info[0]]:
+                        break
+                    del self._task_infos[self._rcvd_idx]
+                    self._rcvd_idx += 1
+                    self._batches_outstanding -= 1
+                else:
+                    # NOTE: _rcvd_idx and _send_idx only record batches among
+                    #       workers, if batches among workers drained, there
+                    #       may also be data in blocking queue
+                    if self._batches_outstanding < len(self._places):
+                        return None
+                    continue
+
+            if self._rcvd_idx in self._task_infos and \
+                    len(self._task_infos[self._rcvd_idx]) == 2:
                 return self._task_infos.pop(self._rcvd_idx)[1]
 
             try:
diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py
index e47f57381c0decaf365df58e96d6386a01f9ef1e..13bb946a5ebca09686fc7f56b2f7c5b068ea3148 100644
--- a/python/paddle/fluid/dataloader/dataset.py
+++ b/python/paddle/fluid/dataloader/dataset.py
@@ -14,9 +14,10 @@
 
 from __future__ import print_function
 
+from .. import framework
 import paddle.dataset.common
 
-__all__ = ["Dataset", "IterableDataset"]
+__all__ = ["Dataset", "IterableDataset", "TensorDataset"]
 
 
 class Dataset(object):
@@ -222,3 +223,55 @@ class IterableDataset(Dataset):
     def __len__(self):
         raise RuntimeError("'{}' should not be called for IterableDataset" \
                 "{}".format('__len__', self.__class__.__name__))
+
+
+class TensorDataset(Dataset):
+    """
+    Dataset defined by a list of tensors.
+
+    Each tensor should be in shape of [N, ...], where N is the sample number,
+    and each tensor contains a field of sample, :code:`TensorDataset` retrieves
+    each sample by indexing tensors in the 1st dimension.
+
+    Args:
+        tensors(list of Tensor): tensors with same shape in the 1st dimension.
+
+    Returns:
+        Dataset: a Dataset instance wrapping tensors.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            from paddle.io import TensorDataset
+
+            paddle.disable_static()
+
+            input_np = np.random.random([2, 3, 4]).astype('float32')
+            input = paddle.to_tensor(input_np)
+            label_np = np.random.random([2, 1]).astype('int32')
+            label = paddle.to_tensor(label_np)
+
+            dataset = TensorDataset([input, label])
+
+            for i in range(len(dataset)):
+                input, label = dataset[i]
+                print(input, label)
+
+    """
+
+    def __init__(self, tensors):
+        if not framework.in_dygraph_mode():
+            raise RuntimeError(
+                "TensorDataset can only be used in imperative mode")
+        assert all([tensor.shape[0] == tensors[0].shape[0] for tensor in tensors]), \
+                "tensors not have same shape of the 1st dimension"
+        self.tensors = tensors
+
+    def __getitem__(self, index):  # one sample: the index-th slice of each field
+        return tuple(tensor[index] for tensor in self.tensors)
+
+    def __len__(self):  # sample count is the size of the 1st dimension
+        return self.tensors[0].shape[0]
diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py
index fc14e9b390e6ae4d695252f064f1f0697aaee258..cf270ced3b704179856b1ab04dbeae8a04fbc589 100644
--- a/python/paddle/fluid/dygraph/__init__.py
+++ b/python/paddle/fluid/dygraph/__init__.py
@@ -38,9 +38,6 @@ from .checkpoint import *
 from . import learning_rate_scheduler
 from .learning_rate_scheduler import *
 
-from . import backward_strategy
-from .backward_strategy import *
-
 from . import jit
 from .jit import *
 
@@ -69,7 +66,6 @@ __all__ += nn.__all__
 __all__ += parallel.__all__
 __all__ += checkpoint.__all__
 __all__ += learning_rate_scheduler.__all__
-__all__ += backward_strategy.__all__
 __all__ += jit.__all__
 __all__ += io.__all__
 __all__ += rnn.__all__
diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index d4f1ca333945d8933a7a9df7ca93ea825e5cf110..2174dbd31b8fb1ae97894699e03e25e809085cc8 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -319,8 +319,7 @@ def grad(outputs,
          create_graph=False,
          only_inputs=True,
          allow_unused=False,
-         no_grad_vars=None,
-         backward_strategy=None):
+         no_grad_vars=None):
     ''' 
     .. note::
         **This API is ONLY available in Dygraph mode.**
@@ -328,19 +327,19 @@ def grad(outputs,
     This API computes the sum of gradients of `outputs` with respect to each `inputs` .
 
     Parameters:
-        outputs (Variable|list(Variable)|tuple(Variable)): the output Variable or 
-            Variable list/tuple of the graph to compute gradients.
-        inputs (Variable|list(Variable)|tuple(Variable)): the input Variable or 
-            Variable list/tuple of the graph to compute gradients. The returned
+        outputs (Tensor|list(Tensor)|tuple(Tensor)): the output Tensor or 
+            Tensor list/tuple of the graph to compute gradients.
+        inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or 
+            Tensor list/tuple of the graph to compute gradients. The returned
             values of this API are the gradients of `inputs` . 
-        grad_outputs (Variable|list(Variable|None)|tuple(Variable|None), optional): 
+        grad_outputs (Tensor|list(Tensor|None)|tuple(Tensor|None), optional): 
             initial gradient values of `outputs` . If `grad_outputs` is None, 
             the initial gradient values of `outputs` would be Tensors filled with 1; 
             if `grad_outputs` is not None, it must have the same length as `outputs` , 
             and in this case, the initial gradient value of the i-th `outputs` would
             be: (1) a Tensor filled with 1 when the i-th element of `grad_outputs` 
             is None; (2) the i-th element of `grad_outputs` when the i-th element of
-            `grad_outputs` is a Variable. Default None.
+            `grad_outputs` is a Tensor. Default None.
         retain_graph (bool, optional): whether to retain the forward graph which 
             is used to calculate the gradient. When it is True, the graph would 
             be retained, in which way users can calculate backward twice for the 
@@ -352,24 +351,21 @@ def grad(outputs,
             computing process would be discarded. Default False.
         only_inputs (bool, optional): whether to only compute the gradients of
             `inputs` . If it is False, the gradients of all remaining leaf 
-            Variables in the graph would be also computed and accumulated. 
+            Tensors in the graph would be also computed and accumulated. 
             If it is True, only the gradients of `inputs` would be computed.
             Default True. only_inputs=False is under development, and it is
             not supported yet.    
         allow_unused (bool, optional): whether to raise error or return None if some 
-            Variables of `inputs` are unreachable in the graph. If some Variables of 
+            Tensors of `inputs` are unreachable in the graph. If some Tensors of 
             `inputs` are unreachable in the graph (i.e., their gradients are None),  
             error would be raised if allow_unused=False, or None would be returned as
             their gradients if allow_unused=True. Default False.
-        no_grad_vars (Variable|list(Variable)|tuple(Variable)|set(Variable), optional): 
-            the Variables whose gradients are not needed to compute. Default None.
-        backward_strategy (BackwardStrategy, optional): The backward strategy to
-            compute gradients. See :ref:`api_fluid_dygraph_BackwardStrategy` for
-            details. Default None.
+        no_grad_vars (Tensor|list(Tensor)|tuple(Tensor)|set(Tensor), optional): 
+            the Tensors whose gradients are not needed to compute. Default None.
 
     Returns:
-        tuple: a tuple of Variables, whose length is the same as the Variable number 
-        inside `inputs`, and the i-th returned Variable is the sum of gradients of 
+        tuple: a tuple of Tensors, whose length is the same as the Tensor number 
+        inside `inputs`, and the i-th returned Tensor is the sum of gradients of 
         `outputs` with respect to the i-th `inputs`.
 
     Examples 1:
@@ -503,12 +499,6 @@ def grad(outputs,
         raise AssertionError(
             "no_grad_vars must be None, Variable or list/tuple/set of Variables")
 
-    if backward_strategy is None:
-        backward_strategy = core.BackwardStrategy()
-
-    assert isinstance(backward_strategy, core.BackwardStrategy), \
-        "backward_strategy must be type paddle.fluid.dygraph.BackwardStrategy"
-
     assert isinstance(create_graph, bool), "create_graph must be True or False"
 
     if retain_graph is None:
@@ -524,9 +514,9 @@ def grad(outputs,
 
     place = core.Place()
     place.set_place(framework._current_expected_place())
-    return core.dygraph_partial_grad(
-        inputs, outputs, grad_outputs, no_grad_vars, place, backward_strategy,
-        create_graph, retain_graph, allow_unused, only_inputs)
+    return core.dygraph_partial_grad(inputs, outputs, grad_outputs,
+                                     no_grad_vars, place, create_graph,
+                                     retain_graph, allow_unused, only_inputs)
 
 
 @framework.dygraph_only
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/__init__.py b/python/paddle/fluid/dygraph/dygraph_to_static/__init__.py
index 1f91027e462d3437b0ef01455aa037cb38d8b658..9608910ee8d6223ea8e7bab06d5db90632cc2be0 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/__init__.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/__init__.py
@@ -34,6 +34,9 @@ from .convert_call_func import *
 
 from . import convert_operators
 
+from . import logging_utils
+from .logging_utils import *
+
 __all__ = []
 __all__ += ast_transformer.__all__
 __all__ += loop_transformer.__all__
@@ -41,3 +44,4 @@ __all__ += static_analysis.__all__
 __all__ += variable_trans_func.__all__
 __all__ += program_translator.__all__
 __all__ += convert_call_func.__all__
+__all__ += logging_utils.__all__
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
index 8297f16f6081829029064600ad617044591bc256..5152799ca72f1461d6fbfc3a619a6aa9b9477934 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 # as produced by ast.parse from the standard ast module.
 # See details in https://github.com/serge-sans-paille/gast/
 import gast
-
 from paddle.fluid.dygraph.dygraph_to_static.assert_transformer import AssertTransformer
 from paddle.fluid.dygraph.dygraph_to_static.basic_api_transformer import BasicApiTransformer
 from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import BreakContinueTransformer
@@ -31,9 +30,11 @@ from paddle.fluid.dygraph.dygraph_to_static.logical_transformer import LogicalTr
 from paddle.fluid.dygraph.dygraph_to_static.loop_transformer import LoopTransformer
 from paddle.fluid.dygraph.dygraph_to_static.print_transformer import PrintTransformer
 from paddle.fluid.dygraph.dygraph_to_static.return_transformer import ReturnTransformer
+from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor
 from paddle.fluid.dygraph.dygraph_to_static.tensor_shape_transformer import TensorShapeTransformer
 
-from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor
+from paddle.fluid.dygraph.dygraph_to_static import logging_utils
+from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
 from paddle.fluid.dygraph.dygraph_to_static.utils import get_attribute_full_name
 
 __all__ = ['DygraphToStaticAst']
@@ -57,45 +58,70 @@ class DygraphToStaticAst(gast.NodeTransformer):
         return self.static_analysis_root
 
     def transfer_from_node_type(self, node_wrapper):
+        translator_logger = logging_utils.TranslatorLogger()
+        translator_logger.log(
+            1, "   Source code: \n{}".format(ast_to_source_code(self.root)))
         # Generic transformation
         self.visit(node_wrapper.node)
 
         # Transform basic api of dygraph to static graph and get feed_name_to_arg_name
-        basic_api_trans = BasicApiTransformer(node_wrapper)
-        basic_api_trans.transform()
+        BasicApiTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(1, self.root,
+                                               "BasicApiTransformer")
 
         # Transform Tensor.shape into fluid.layers.shape(Tensor)
         TensorShapeTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(2, self.root,
+                                               "TensorShapeTransformer")
 
         # Transform list used in control flow
         ListTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(3, self.root, "ListTransformer")
 
         # Transform break/continue in loops
         BreakContinueTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(4, self.root,
+                                               "BreakContinueTransformer")
 
         # Transform return in functions
         ReturnTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(5, self.root,
+                                               "ReturnTransformer")
 
         # Transform logical and/or/not
         LogicalTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(6, self.root,
+                                               "LogicalTransformer")
 
         # Transform for loop and while loop
         LoopTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(7, self.root, "LoopTransformer")
 
         # Transform all if/else statement of Dygraph into Static Graph.
         IfElseTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(8, self.root,
+                                               "IfElseTransformer")
 
         # Transform python assert statement
         AssertTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(9, self.root,
+                                               "AssertTransformer")
 
         # Transform all python print statement
         PrintTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(10, self.root,
+                                               "PrintTransformer")
 
         # Transform call recursively
         CallTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(11, self.root, "CallTransformer")
 
         # Transform python type casting statement
         CastTransformer(node_wrapper).transform()
+        translator_logger.log_transformed_code(12, self.root, "CastTransformer")
+
+        translator_logger.log_transformed_code(logging_utils.LOG_AllTransformer,
+                                               self.root, "All Transformers")
 
     def visit_FunctionDef(self, node):
         if self.decorate_func_name is None:
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py
index 4ba1d302576df695c5b2e867452b91b3d1d2844a..7fc72d42759b0f8029ac6adfc7b9670fbffc67d5 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py
@@ -19,6 +19,8 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrappe
 from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
 from paddle.fluid.dygraph.dygraph_to_static.utils import is_paddle_api
 
+PDB_SET = "pdb.set_trace"
+
 
 class CallTransformer(gast.NodeTransformer):
     """
@@ -62,6 +64,12 @@ class CallTransformer(gast.NodeTransformer):
             return node
 
         func_str = ast_to_source_code(node.func).strip()
+
+        # NOTE(liym27): Don't convert `pdb.set_trace` even if the conversion doesn't work finally, because
+        # it is clearer to see where it is called from.
+        if PDB_SET in func_str:
+            return node
+
         new_func_str = "fluid.dygraph.dygraph_to_static.convert_call({})".format(
             func_str)
         new_func_ast = gast.parse(new_func_str).body[0].value
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
index edd7dfcf93977bb3244c0c1676715a65dba88dc2..4630cfcdabfd307ea03a7fd0c885c73ce4a4ea0b 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
@@ -27,13 +27,16 @@ import types
 import numpy
 import six
 
-from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
-from paddle.fluid.dygraph.layers import Layer
 from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_len
+from paddle.fluid.dygraph.dygraph_to_static.logging_utils import TranslatorLogger
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticLayer
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import convert_to_static
+from paddle.fluid.dygraph.layers import Layer
 
-DECORATOR_NAMES = ['declarative', 'dygraph_to_static_func']
-program_translator = ProgramTranslator()
-to_static_func = program_translator.get_func
+# TODO(liym27): A better way to do this.
+BUILTIN_LIKELY_MODULES = [collections, pdb, copy, inspect, re, six, numpy]
+
+translator_logger = TranslatorLogger()
 
 
 def is_builtin(func):
@@ -41,11 +44,6 @@ def is_builtin(func):
         return True
     elif func in six.moves.builtins.__dict__.values():
         return True
-    # Other built-in modules
-    # TODO(liym27): A better way to do this.
-    elif any(func in m.__dict__.values()
-             for m in (collections, pdb, copy, inspect, re, six, numpy)):
-        return True
     else:
         return False
 
@@ -61,9 +59,29 @@ def is_paddle_func(func):
     return m is not None and m.__name__.startswith("paddle")
 
 
+def is_unsupported(func):
+    """
+    Returns True if `func` is whitelisted (found in BUILTIN_LIKELY_MODULES or a Paddle function), else False.
+    """
+    if any(func in m.__dict__.values() for m in BUILTIN_LIKELY_MODULES):
+        translator_logger.log(
+            2,
+            "Whitelist: {} is part of built-in module and does not have to be transformed.".
+            format(func))
+        return True
+
+    if is_paddle_func(func):
+        translator_logger.log(
+            2,
+            "Whitelist: {} is part of Paddle module and does not have to be transformed.".
+            format(func))
+        return True
+    return False  # explicit bool; the predicate used to fall through and return None
+
+
 def convert_call(func):
     """
-    Converts a function call which needs to be transformed to static fucntion.
+    Converts a function call which needs to be transformed to static function.
 
     Args:
         func (callable): A callable function or method to convert.
@@ -95,13 +113,24 @@ def convert_call(func):
           #  [1. 1. 1.]]
 
     """
+    translator_logger.log(1,
+                          "Convert callable object: convert {}.".format(func))
     func_self = None
     converted_call = None
 
+    # Function in convert_call may be decorated by another `@declarative`,
+    # in this case, unwraps it into a raw method or function.
+    if isinstance(func, StaticLayer):
+        instance = func._class_instance
+        if instance is not None:
+            func = func.dygraph_function.__get__(instance)
+        else:
+            func = func.dygraph_function
+
     if is_builtin_len(func):
         return convert_len
 
-    if is_builtin(func) or is_paddle_func(func):
+    if is_builtin(func) or is_unsupported(func):
         return func
 
     if inspect.isfunction(func):
@@ -109,12 +138,36 @@ def convert_call(func):
         if func.__name__ == '<lambda>':
             return func
         try:
-            global_funcs = set([
-                fn for fn in func.__globals__.values() if inspect.isfunction(fn)
-            ])
-            if func in global_funcs:
-                converted_call = to_static_func(func)
+            # Note(Aurelius84): Because `@declarative` returns a class instance instead of
+            # a function. This will modify the value referring to itself in `__globals__`.
+
+            # For example: 
+            #
+            #      @declarative
+            #      def foo(x):
+            #          return x
+            #
+            # `foo` will be converted into a wrapper class, suppose as `StaticLayer`.
+            # And `foo.__globals__['foo']` will still return this `StaticLayer` instead of
+            # `foo` function. So `isinstance(fn, StaticLayer)` is added here. 
+            global_functions = set()
+            for fn in func.__globals__.values():
+                if inspect.isfunction(fn):
+                    global_functions.add(fn)
+                elif isinstance(fn, StaticLayer):
+                    global_functions.add(fn.dygraph_function)
+
+            if func in global_functions:
+                converted_call = convert_to_static(func)
                 func_self = getattr(func, '__self__', None)
+            else:
+                # NOTE:
+                # If func is not in __globals__, it does not need to be transformed
+                # because it has been transformed before.
+                translator_logger.warn(
+                    "{} doesn't have to be transformed to static function because it has been transformed before, it will be run as-is."
+                    .format(func))
+                converted_call = func
         except AttributeError:
             # NOTE:
             # If func is not in __globals__, it does not need to be transformed
@@ -127,7 +180,7 @@ def convert_call(func):
             converted_call = None
     elif inspect.ismethod(func):
         try:
-            converted_call = to_static_func(func)
+            converted_call = convert_to_static(func)
             func_self = getattr(func, '__self__', None)
         except (IOError, OSError):
             # NOTE: func may have been decorated.
@@ -136,7 +189,7 @@ def convert_call(func):
     elif hasattr(func, '__class__') and hasattr(func.__class__, '__call__'):
         if hasattr(func, 'forward') and isinstance(func, Layer):
             try:
-                forward_func = to_static_func(func.forward)
+                forward_func = convert_to_static(func.forward)
                 setattr(func, 'forward', forward_func)
                 func_self = func
             except Exception:
@@ -146,15 +199,21 @@ def convert_call(func):
         else:
             try:
                 call_func = func.__class__.__call__
-                converted_call = to_static_func(call_func)
+                converted_call = convert_to_static(call_func)
                 func_self = func
             except Exception:
                 # NOTE:
                 # If `func` is a class which is being initialized, for example `convert_call(Foo)()`,
                 # it doesn't need to be transformed
                 func_self = None if func_self else func_self
+    else:
+        raise NotImplementedError(
+            "Callable {} can not be transformed at present.".format(func))
 
     if converted_call is None:
+        translator_logger.warn(
+            "{} doesn't have to be transformed to static function, and it will be run as-is."
+            .format(func))
         return func
 
     if func_self:
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..5540c63a85bd7f8db760f0c3e25be9eefa2aace7
--- /dev/null
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
@@ -0,0 +1,311 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import six
+import inspect
+import numpy as np
+import collections
+import paddle
+from paddle.fluid import core
+from paddle.fluid.dygraph import layers
+from paddle.fluid.layers.utils import flatten
+from paddle.fluid.layers.utils import pack_sequence_as
+from paddle.fluid.dygraph.base import switch_to_static_graph
+from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs
+from paddle.fluid.dygraph.dygraph_to_static.utils import type_name
+from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code
+
+
+class FunctionSpec(object):
+    """
+    Wraps a dygraph function (or method) together with its optional `input_spec`.
+    """
+
+    def __init__(self, function, input_spec=None):
+        self._dygraph_function = function
+        if input_spec is None:
+            self._input_spec = None
+            self._flat_input_spec = None
+        else:
+            self._input_spec = self._verify_input_spec(input_spec)
+            self._flat_input_spec = flatten(self._input_spec)
+
+        # parse full argument names list.
+        self._arg_names, self._default_kwargs = parse_arg_and_kwargs(function)
+
+    def unified_args_and_kwargs(self, args, kwargs):
+        """
+        Moves keyword arguments and default values into the positional list so that `args`
+        holds one value per argument name of the function definition.
+
+        For example:
+
+            Given function definition: `def foo(x, a=1, b=2)`,
+            when calling it by `foo(23)`, the args is `(23,)`, kwargs is `{}`.
+            In this function, it will return args with `(23, 1, 2)`, kwargs with `{}`
+
+        Args:
+            args(tuple): tuple of input arguments value of decorated function.
+            kwargs(dict): dict of input keyword arguments value of decorated function.
+
+        Return:
+            Tuple `(args, kwargs)`: `args` filled up as above, `kwargs` with the moved entries deleted in place.
+        """
+        if len(self._arg_names) < len(args):
+            error_msg = "The decorated function `{}` requires {} arguments: {}, but received {} with {}.".format(
+                self._dygraph_function.__name__,
+                len(self._arg_names), self._arg_names, len(args), args)
+            if args and inspect.isclass(args[0]):
+                error_msg += "\n\tMaybe the function has more than one decorator, we don't support this for now."
+                raise NotImplementedError(error_msg)
+            else:
+                raise ValueError(error_msg)
+
+        args = list(args)
+
+        for i in six.moves.range(len(args), len(self._arg_names)):
+            arg_name = self._arg_names[i]
+            if arg_name in kwargs:
+                args.append(kwargs[arg_name])
+                del kwargs[arg_name]
+            else:
+                if arg_name not in self._default_kwargs:
+                    raise ValueError(
+                        "`{}()` requires `{}` arguments, but not found in input `args`: {} and `kwargs`: {}.".
+                        format(self._dygraph_function.__name__, arg_name, args,
+                               kwargs))
+                args.append(self._default_kwargs[arg_name])
+
+        return tuple(args), kwargs
+
+    def args_to_input_spec(self, args, kwargs):
+        """
+        Converts input arguments into InputSpec.
+
+        1. If `input_spec` was specified, arguments are replaced via `convert_to_input_spec`.
+        2. If input_spec is None, every numpy.ndarray and VarBase found in `args` is replaced by an InputSpec.
+
+        Args:
+            args(tuple): tuple of input arguments value of function containing default kwargs value.
+            kwargs(dict): kwargs arguments received by **kwargs.
+
+        Return:
+            Same nest structure with args by replacing value with InputSpec.
+        """
+        input_with_spec = []
+
+        if self._input_spec is not None:
+            # Note: The value types and length of `kwargs` are uncertain,
+            # so this case is not supported when `input_spec` is specified.
+            if kwargs:
+                raise ValueError(
+                    "{} got unexpected keyword arguments: {}. Cannot trace the function when `input_spec` is specificed.".
+                    format(self._dygraph_function.__name__, kwargs))
+
+            # Note: The length of `args` can be greater than that of `input_spec`,
+            # because `args` may contain non-tensor values merged from `kwargs`
+            # after `unified_args_and_kwargs`.
+            if len(args) < len(self._input_spec):
+                raise ValueError(
+                    "Requires len(arguments) >= len(input_spec), but received len(args):{} < len(InputSpec): {}".
+                    format(len(args), len(self._input_spec)))
+
+            # replace argument with corresponding InputSpec.
+            input_with_spec = convert_to_input_spec(args, self._input_spec)
+        else:
+            for idx, input_var in enumerate(flatten(args)):
+                if isinstance(input_var, np.ndarray):
+                    input_var = paddle.static.InputSpec.from_numpy(input_var)
+                elif isinstance(input_var, core.VarBase):
+                    input_var = paddle.static.InputSpec.from_tensor(input_var)
+
+                input_with_spec.append(input_var)
+
+            input_with_spec = pack_sequence_as(args, input_with_spec)
+
+        return input_with_spec
+
+    @switch_to_static_graph
+    def to_static_inputs_with_spec(self, input_with_spec, main_program):
+        """
+        Creates a variable in the global block of `main_program` for each InputSpec; other values pass through.
+
+        Args:
+            input_with_spec(tuple): input arguments by replacing argument with InputSpec.
+            main_program(Program): main program for inserting feed layer.
+        """
+        flat_input_spec = flatten(input_with_spec)
+
+        inputs = []
+        block = main_program.global_block()
+        for i, var_spec in enumerate(flat_input_spec):
+            if isinstance(var_spec, paddle.static.InputSpec):
+                feed_layer = block.create_var(
+                    # TODO(Aurelius84): consider a more elegant way to name this
+                    name=var_spec.name or "feed_%s" % i,
+                    shape=var_spec.shape,
+                    dtype=var_spec.dtype,
+                    is_data=True,
+                    need_check_feed=False)
+            else:
+                feed_layer = var_spec
+            inputs.append(feed_layer)
+
+        return pack_sequence_as(input_with_spec, inputs)
+
+    def _verify_input_spec(self, input_spec):
+        """
+        Checks that `input_spec` is a tuple/list whose flattened elements are all InputSpec; returns it as a tuple.
+        """
+        if not isinstance(input_spec, (tuple, list)):
+            raise TypeError(
+                "The type(input_spec) should be one of (tuple, list), but received {}.".
+                format(type_name(input_spec)))
+        input_spec = tuple(input_spec)
+        for spec in flatten(input_spec):
+            if not isinstance(spec, paddle.static.InputSpec):
+                raise ValueError(
+                    "The type(elem) from input_spec should be `InputSpec`, but received {}.".
+                    format(type_name(spec)))
+
+        return input_spec
+
+    def __repr__(self):
+        return "function: {}({}), input_spec: {}".format(
+            self._dygraph_function.__name__, ','.join(self._arg_names),
+            self._input_spec)
+
+    @property
+    def dygraph_function(self):
+        return self._dygraph_function
+
+    @property
+    def args_name(self):
+        return self._arg_names
+
+    @property
+    def input_spec(self):
+        return self._input_spec
+
+    @property
+    def flat_input_spec(self):
+        return self._flat_input_spec
+
+    @property
+    def code(self):
+        return func_to_source_code(self._dygraph_function)  # source text of the wrapped function
+
+
+def get_parameters(layer_instance, include_sublayer=True):
+    """
+    Returns the parameters of `layer_instance` (an empty OrderedDict when it is None).
+    With `include_sublayer` True they are taken from `parameters()` and keyed by name;
+    otherwise the layer's own `_parameters` is returned as-is.
+    """
+    if layer_instance is None:
+        return collections.OrderedDict()
+
+    if not isinstance(layer_instance, layers.Layer):
+        raise TypeError(
+            "Type of `layer_instance` should be nn.Layer, but received {}".
+            format(type_name(layer_instance)))
+
+    if not include_sublayer:
+        return layer_instance._parameters
+
+    all_params = layer_instance.parameters()
+    keyed_params = [(param.name, param) for param in all_params]
+    return collections.OrderedDict(keyed_params)
+
+
+def get_buffers(layer_instance, include_sublayer=True):
+    """
+    Returns the buffers of `layer_instance` (an empty OrderedDict when it is None).
+    With `include_sublayer` True they are taken from `buffers()` and keyed by name;
+    otherwise the layer's own `_buffers` is returned as-is.
+    """
+    if layer_instance is None:
+        return collections.OrderedDict()
+    if not isinstance(layer_instance, layers.Layer):
+        raise TypeError(
+            "Type of `layer_instance` should be nn.Layer, but received {}".
+            format(type_name(layer_instance)))
+
+    if not include_sublayer:
+        return layer_instance._buffers
+
+    all_buffers = layer_instance.buffers()
+    keyed_buffers = [(buf.name, buf) for buf in all_buffers]
+    return collections.OrderedDict(keyed_buffers)
+
+
+def convert_to_input_spec(inputs, input_spec):
+    """
+    Replaces tensor in structured `inputs` by InputSpec in `input_spec`.
+
+    Args:
+        inputs(list|tuple|dict): nested structure of the actual inputs.
+        input_spec(list|tuple|dict|InputSpec): spec with the same nested structure as `inputs`.
+    Return:
+        A list/dict mirroring `inputs` with elements replaced by the given InputSpec, or the InputSpec itself.
+    Raises:
+        TypeError on container type mismatch or unsupported spec type; ValueError if `inputs` is shorter than the spec.
+    """
+
+    def check_type_and_len(input, spec, check_length=False):
+        if type(input) is not type(spec):
+            raise TypeError('type(input) should be {}, but received {}.'.format(
+                type(spec), type(input)))
+        if check_length and len(input) < len(spec):
+            raise ValueError(
+                'Requires len(inputs) >= len(input_spec), but received len(inputs):{} < len(input_spec):{}'.
+                format(len(input), len(spec)))
+
+    if isinstance(input_spec, (tuple, list)):
+        input_with_spec = []
+        check_type_and_len(inputs, input_spec, True)
+
+        for i, spec in enumerate(input_spec):
+            out_spec = convert_to_input_spec(inputs[i], spec)
+            input_with_spec.append(out_spec)
+
+        # Note: If the rest inputs contain tensor or numpy.ndarray
+        # without specific InputSpec, raise warning.
+        if len(inputs) > len(input_spec):
+            for rest_input in inputs[len(input_spec):]:
+                if isinstance(rest_input, (core.VarBase, np.ndarray)):
+                    logging.warning(
+                        "The inputs constain `{}` without specificing InputSpec, its shape and dtype will be treated immutable. "
+                        "Please specific InputSpec information in `@declarative` if you expect them as mutable inputs.".
+                        format(type_name(rest_input)))
+        input_with_spec.extend(inputs[len(input_spec):])
+
+        return input_with_spec
+    elif isinstance(input_spec, dict):
+        input_with_spec = {}
+        check_type_and_len(inputs, input_spec, True)
+        for name, input in inputs.items():
+            if name in input_spec:
+                input_with_spec[name] = convert_to_input_spec(input,
+                                                              input_spec[name])
+            else:
+                input_with_spec[name] = input
+        return input_with_spec
+    elif isinstance(input_spec, paddle.static.InputSpec):
+        return input_spec
+    else:
+        raise TypeError(  # `format(` was missing here, so this raised AttributeError on str
+            "The type(input_spec) should be a `InputSpec` or dict/list/tuple of it, but received {}.".
+            format(type_name(input_spec)))
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..75cb65085846d672d2488c98bf6ad625ac12e78b
--- /dev/null
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
@@ -0,0 +1,211 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import threading
+
+import six
+from paddle.fluid import log_helper
+from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
+
+__all__ = ["TranslatorLogger", "set_verbosity", "set_code_level"]
+
+VERBOSITY_ENV_NAME = 'TRANSLATOR_VERBOSITY'
+CODE_LEVEL_ENV_NAME = 'TRANSLATOR_CODE_LEVEL'
+DEFAULT_VERBOSITY = -1
+DEFAULT_CODE_LEVEL = -1
+
+
+def synchronized(func):  # decorator: serializes calls to `func` across threads
+    lock = threading.RLock()  # one lock per decorated function, shared by every call
+    def wrapper(*args, **kwargs):
+        with lock:  # the old per-call `threading.Lock()` never excluded any other caller
+            return func(*args, **kwargs)
+    return wrapper
+
+
+class TranslatorLogger(object):
+    """
+    Class for logging and debugging during the transformation from dygraph to static graph.
+    The object of this class is a singleton.
+    """
+
+    @synchronized
+    def __new__(cls, *args, **kwargs):
+        if not hasattr(cls, '_instance'):
+            cls._instance = object.__new__(cls, *args, **kwargs)
+            cls._instance._initialized = False
+        return cls._instance
+
+    def __init__(self):
+        if self._initialized:  # __init__ runs on every TranslatorLogger() call; set up only once
+            return
+
+        self._initialized = True
+        self._logger = log_helper.get_logger(
+            __name__, 1, fmt='%(asctime)s-%(levelname)s: %(message)s')
+        self._verbosity_level = None
+        self._transformed_code_level = None
+
+    @property
+    def logger(self):
+        return self._logger
+
+    @property
+    def verbosity_level(self):
+        if self._verbosity_level is not None:  # an explicit setting wins over the environment
+            return self._verbosity_level
+        else:
+            return int(os.getenv(VERBOSITY_ENV_NAME, DEFAULT_VERBOSITY))
+
+    @verbosity_level.setter
+    def verbosity_level(self, level):
+        self.check_level(level)
+        self._verbosity_level = level
+
+    @property
+    def transformed_code_level(self):
+        if self._transformed_code_level is not None:  # an explicit setting wins over the environment
+            return self._transformed_code_level
+        else:
+            return int(os.getenv(CODE_LEVEL_ENV_NAME, DEFAULT_CODE_LEVEL))
+
+    @transformed_code_level.setter
+    def transformed_code_level(self, level):
+        self.check_level(level)
+        self._transformed_code_level = level
+
+    def check_level(self, level):
+        if isinstance(level, (six.integer_types, type(None))):  # None is accepted and clears an explicit setting
+            rv = level
+        else:
+            raise TypeError("Level is not an integer: {}".format(level))
+        return rv
+
+    def has_code_level(self, level):
+        level = self.check_level(level)
+        return level == self.transformed_code_level  # exact match only
+
+    def has_verbosity(self, level):
+        level = self.check_level(level)
+        return level >= self.verbosity_level  # NOTE(review): set_verbosity's doc implies the opposite direction - confirm
+
+    def error(self, msg, *args, **kwargs):
+        self.logger.error(msg, *args, **kwargs)
+
+    def warn(self, msg, *args, **kwargs):
+        self.logger.warning(msg, *args, **kwargs)  # Logger.warn is a deprecated alias of warning
+
+    def log(self, level, msg, *args, **kwargs):
+        if self.has_verbosity(level):
+            self.logger.log(level, msg, *args, **kwargs)
+
+    def log_transformed_code(self, level, ast_node, transformer_name, *args,
+                             **kwargs):
+        if self.has_code_level(level):
+            source_code = ast_to_source_code(ast_node)
+            header_msg = "After the level {} ast transformer: '{}', the transformed code:\n"\
+                .format(level, transformer_name)
+
+            msg = header_msg + source_code
+            self.logger.info(msg, *args, **kwargs)
+
+
+_TRANSLATOR_LOGGER = TranslatorLogger()
+
+
+def set_verbosity(level=0):
+    """
+    Sets the verbosity level of log for dygraph to static graph.
+    There are two means to set the logging verbosity:
+     1. Call function `set_verbosity`
+     2. Set environment variable `TRANSLATOR_VERBOSITY`
+
+    **Note**:
+    `set_verbosity` has a higher priority than the environment variable.
+
+    Args:
+        level(int): The verbosity level; `log(lvl, ...)` emits when `lvl >= level` (see `has_verbosity`).
+            The default value is 0. NOTE(review): earlier text said a larger value means more verbosity - confirm intent.
+
+    Examples:
+        .. code-block:: python
+
+            import os
+            import paddle
+
+            paddle.jit.set_verbosity(1)
+            # The verbosity level is now 1
+
+            os.environ['TRANSLATOR_VERBOSITY'] = '3'
+            # The environment variable is now 3, but it has no effect because it has a lower priority than `set_verbosity`
+    """
+    _TRANSLATOR_LOGGER.verbosity_level = level
+
+
+def get_verbosity():  # explicit setting if one was made, else the TRANSLATOR_VERBOSITY env value
+    return _TRANSLATOR_LOGGER.verbosity_level
+
+
+LOG_AllTransformer = 100
+
+
+def set_code_level(level=LOG_AllTransformer):
+    """
+    Sets the level to print code from specific level of Ast Transformer.
+    There are two means to set the code level:
+     1. Call function `set_code_level`
+     2. Set environment variable `TRANSLATOR_CODE_LEVEL`
+
+    **Note**:
+    `set_code_level` has a higher priority than the environment variable.
+
+    Args:
+        level(int): The level to print code. Default is 100, which means to print the code after all AST Transformers.
+
+    Examples:
+        .. code-block:: python
+
+            import os
+            import paddle
+            paddle.jit.set_code_level(2)
+            # It will print the transformed code at level 2, which means to print the code after second transformer,
+            # as the date of August 28, 2020, it is CastTransformer.
+
+            os.environ['TRANSLATOR_CODE_LEVEL'] = '3'
+            # The environment variable is now 3, but it has no effect because it has a lower priority than `set_code_level`
+
+    """
+    _TRANSLATOR_LOGGER.transformed_code_level = level
+
+
+def get_code_level():  # explicit setting if one was made, else the TRANSLATOR_CODE_LEVEL env value
+    return _TRANSLATOR_LOGGER.transformed_code_level
+
+
+def error(msg, *args, **kwargs):  # module-level shortcut to the shared logger's error()
+    _TRANSLATOR_LOGGER.error(msg, *args, **kwargs)
+
+
+def warn(msg, *args, **kwargs):  # module-level shortcut to the shared logger's warn()
+    _TRANSLATOR_LOGGER.warn(msg, *args, **kwargs)
+
+
+def log(level, msg, *args, **kwargs):  # emitted only when the shared logger's has_verbosity(level) holds
+    _TRANSLATOR_LOGGER.log(level, msg, *args, **kwargs)
+
+
+def log_transformed_code(level, ast_node, transformer_name, *args, **kwargs):  # prints only if `level` equals the code level
+    _TRANSLATOR_LOGGER.log_transformed_code(level, ast_node, transformer_name,
+                                            *args, **kwargs)
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
index ceacba25375c64552e1e85d046ca494b078ee66d..698d989343a23015529a3b37b285640466d1c30d 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -13,25 +13,23 @@
 # limitations under the License.
 
 from __future__ import print_function
-
+import gast
 import collections
+import logging
 import inspect
+import six
 import textwrap
 import threading
 import warnings
 
 import gast
-import numpy as np
-from paddle.fluid import core
-from paddle.fluid import executor
 from paddle.fluid import framework
-from paddle.fluid import scope_guard
-from paddle.fluid import unique_name
-from paddle.fluid.data_feeder import check_type
 from paddle.fluid.dygraph import layers
+from paddle.fluid.data_feeder import check_type
+from paddle.fluid.layers.utils import flatten
 from paddle.fluid.dygraph.base import param_guard
 from paddle.fluid.dygraph.base import switch_to_static_graph
-from paddle.fluid.dygraph.dygraph_to_static.ast_transformer import DygraphToStaticAst
+from paddle.fluid.dygraph.dygraph_to_static import DygraphToStaticAst
 from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA
 from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data
 from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info
@@ -41,13 +39,19 @@ from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_progr
 from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_func
 from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
 from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code
+from paddle.fluid.dygraph.dygraph_to_static.utils import type_name
 from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap
-from paddle.fluid.layers.utils import flatten
-from paddle.fluid.layers.utils import pack_sequence_as
+from paddle.fluid.dygraph.dygraph_to_static.utils import make_hashable
+from paddle.fluid.dygraph.dygraph_to_static.function_spec import FunctionSpec
+from paddle.fluid.dygraph.dygraph_to_static.function_spec import get_buffers, get_parameters
 from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
 
 __all__ = ['ProgramTranslator', 'convert_to_static']
 
+# For each traced function, we set `MAX_TRACED_PROGRAM_COUNT` = 10 with caching performance in mind.
+# Once the threshold is exceeded, a warning is raised so users can make sure the conversion is as expected.
+MAX_TRACED_PROGRAM_COUNT = 10
+
 
 class FunctionCache(object):
     """
@@ -136,100 +140,323 @@ def convert_to_static(function):
         return static_func
 
 
-class FunctionSpec(object):
-    def __init__(self, func, args, kwargs):
-        self._dyfunc = func
-        self._args = args
-        self._kwargs = kwargs
+class CacheKey(object):
+    """
+    Cached key for ProgramCache.
+    """
+
+    __slots__ = ['function_spec', 'input_with_spec', 'class_instance']
 
-        # TODO(liym27): func has multi layer decorator
-        dyfunc = getattr(func, '__wrapped__', func)
-        self._dyfunc_code = inspect.getsource(dyfunc)
+    def __init__(self, function_spec, input_with_spec, class_instance):
+        """
+        Initializes a cache key.
 
-    def is_method(self):
-        return self._args and isinstance(self._args[0], layers.Layer)
+        Args:
+            function_spec(FunctionSpec): a FunctionSpec instance of decorated function.
+            input_with_spec(list[InputSpec]): actual inputs with some arguments replaced by InputSpec.
+            class_instance(object): an instance of class `Layer`.
+        """
+        self.function_spec = function_spec
+        self.input_with_spec = input_with_spec
+        self.class_instance = class_instance
 
-    def parameters(self, include_sublayer=True):
+    @classmethod
+    def from_func_and_args(cls, function_spec, args, kwargs, class_instance):
         """
-        Returns parameters of decorated layers. If set `include_sublayer` True,
-        the parameters created in sub layers will be added.
+        Generates a CacheKey instance from the given inputs.
+
+        Args:
+            function_spec(FunctionSpec): a FunctionSpec instance of decorated function.
+            args(tuple): tuple of actual inputs arguments.
+            kwargs(dict): dict of actual inputs keyword arguments.
+            class_instance(object): an instance of class `Layer`.
         """
-        params = collections.OrderedDict()
-        if self.is_method():
-            layer_instance = self._args[0]
-            if include_sublayer:
-                params = layer_instance.parameters()
-                names = [p.name for p in params]
-                params = collections.OrderedDict(zip(names, params))
+        # 1. filter `self` in args
+        if args and isinstance(args[0], layers.Layer):
+            args = args[1:]
+        # 2. convert tensor and numpy array into InputSpec 
+        _args, _kwargs = function_spec.unified_args_and_kwargs(args, kwargs)
+        input_with_spec = function_spec.args_to_input_spec(_args, _kwargs)
+
+        # 3. build the cache key from the function spec, converted inputs and class instance
+        return CacheKey(function_spec, input_with_spec, class_instance)
+
+    def __hash__(self):
+        error_msg = "Arguments to a `@paddle.jit.to_static` must be a hashable Python objects (or nested structures of these types)."
+        return hash((id(self.function_spec),
+                     make_hashable(self.input_with_spec, error_msg),
+                     self.class_instance))
+
+    def __eq__(self, other):
+        return (type(self) is type(other)) and hash(self) == hash(other)
+
+    def __ne__(self, other):  # `__ne__` is the hook for `!=`; Python 2 does not derive it from `__eq__`
+        return not self == other
+
+    def __repr__(self):
+        return "id(function_spec): {}, input_with_spec: {}, class_instance: {}".format(
+            id(self.function_spec), self.input_with_spec, self.class_instance)
+
+
+def unwrap_decorators(func):
+    """
+    Unwraps a decorated function and returns the decorator list and inner target.
+    """
+    decorators = []
+    cur = func
+    while True:
+        if isinstance(cur, StaticLayer):
+            decorators.append(cur)
+            # Note: if `cur` is a method, keep it as bound method of class.
+            instance = cur._class_instance
+            if instance is not None:
+                cur = cur.dygraph_function.__get__(instance)
             else:
-                params = layer_instance._parameters
-        return params
+                cur = cur.dygraph_function
+        else:
+            break
+    return decorators, cur
+
 
-    def buffers(self, include_sublayer=True):
+class StaticLayer(object):
+    """
+    Wrapper class to manage program conversion of decorated function.
+
+    """
+
+    def __init__(self, function, input_spec=None):
+        """
+        Initializes a `StaticLayer`.
+
+        Args:
+            function(callable): A function or method that will be converted into static program.
+            input_spec(list[InputSpec]): list of InputSpec to specify the `shape/dtype/name` information for each input argument, default None.
+        """
+        # save the instance `self` while decorating a method of class.
+        if inspect.ismethod(function):
+            self._dygraph_function = getattr(function, '__func__')
+            self._class_instance = getattr(function, '__self__')
+        else:
+            self._dygraph_function = function
+            self._class_instance = None
+
+        self._input_spec = input_spec
+        self._function_spec = FunctionSpec(function, input_spec)
+        self._program_cache = ProgramCache()
+        # Note: Hold a reference to ProgramTranslator for switching `enable_declarative`.
+        self._program_trans = ProgramTranslator()
+
+    def __get__(self, instance, owner):
         """
-        Returns Variable buffers of decorated layers. If set `include_sublayer` True,
-        the Variable buffers created in sub layers will be added.
+        Overrides this method to parse the class instance and call bound method correctly.
+
+        For example:
+            
+            '''
+            class Net(Layer):
+                def __init__(self):
+                    pass
+                
+                @paddle.jit.to_static
+                def forward(self, x, y):
+                    return x + y
+
+            net = Net()
+            out = net(x, y)
+            '''
+        
+        In the above case, `net(x, y)` first calls `net.forward(x, y)`, which is a bound method
+        of the `Net` instance. Once decorated by `@paddle.jit.to_static`, the attribute access first calls
+        `__get__` so that the `Net` instance, not the `StaticLayer` instance, is recorded as the class instance.
+        """
+        self._class_instance = instance
+        return self
+
+    def __call__(self, *args, **kwargs):
         """
-        buffers = collections.OrderedDict()
-        if self.is_method():
-            layer_instance = self._args[0]
-            if include_sublayer:
-                buffers = layer_instance.buffers()
-                names = [buffer.name for buffer in buffers]
-                buffers = collections.OrderedDict(zip(names, buffers))
+        Supports to call the returned instance with input `args` and `kwargs` directly.
+
+        Args:
+            *args(tuple): tuple of all input arguments from original decorated function.
+            **kwargs(dict): dict of all input keyword arguments from original decorated function.
+
+        Return:
+            Outputs of decorated function.
+        """
+        # 1. call dygraph function directly if not enable `declarative`
+        if not self._program_trans.enable_declarative:
+            warnings.warn(
+                "The decorator '@paddle.jit.to_static' doesn't work when setting ProgramTranslator.enable=False. "
+                "We will just return dygraph output.")
+            return self._call_dygraph_function(*args, **kwargs)
+
+        # 2. trace ops from dygraph layers and cache the generated program.
+        args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs)
+        try:
+            concrete_program, partial_program_layer = self.get_concrete_program(
+                *args, **kwargs)
+
+            # 3. synchronize self.training attribute.
+            if isinstance(self._class_instance, layers.Layer):
+                partial_program_layer.training = self._class_instance.training
+
+            # 4. return outputs.
+            return partial_program_layer(args)
+        except Exception as e:
+            if not hasattr(e, ERROR_DATA):
+                # runtime error
+                attach_error_data(e, in_runtime=True)
+            error_data = getattr(e, ERROR_DATA, None)
+            if error_data:
+                new_exception = error_data.create_exception()
+                if six.PY3:
+                    # NOTE(liym27):
+                    # 1. Why `raise new_exception from None`?
+                    #   In Python 3, by default, an new exception is raised with trace information of the caught exception.
+                    #   This only raises new_exception and hides unwanted implementation details from tracebacks of the
+                    #   caught exception.
+                    # 2. Use exec to bypass syntax error checking in Python 2.
+
+                    six.exec_("raise new_exception from None")
+                else:
+                    raise new_exception
             else:
-                buffers = layer_instance._buffers
-        return buffers
+                raise
 
-    @switch_to_static_graph
-    def to_static_inputs(self, main_program):
-        inputs = []
-        block = main_program.global_block()
-        for input_var in flatten(self.args):
-            if isinstance(input_var, np.ndarray):
-                feed_layer = block.create_var(
-                    name=unique_name.generate('feed'),
-                    shape=list(input_var.shape),
-                    dtype=input_var.dtype,
-                    is_data=True,
-                    need_check_feed=False)
-            elif isinstance(input_var, core.VarBase):
-                feed_layer = block.create_var(
-                    name=input_var.name,
-                    shape=list(input_var.shape),
-                    dtype=input_var.dtype,
-                    stop_gradient=input_var.stop_gradient,
-                    need_check_feed=False)
+    def _call_dygraph_function(self, *args, **kwargs):
+        """
+        Calls dygraph function directly and returns the outputs.
+
+        Args:
+            *args(tuple): tuple of all input arguments from original decorated function.
+            **kwargs(dict): dict of all input keyword arguments from original decorated function.
+
+        Return:
+            Outputs of dygraph function.
+        """
+        if self._class_instance is not None:
+            dygraph_function = self._dygraph_function.__get__(
+                self._class_instance)
+        else:
+            dygraph_function = self._dygraph_function
+
+        return dygraph_function(*args, **kwargs)
+
+    def get_concrete_program(self, *args, **kwargs):
+        """
+        Returns traced concrete program and inner executable partial layer.
+
+        Args:
+            *args(tuple): input arguments values or InputSpec
+            **kwargs(dict) : input kwargs values.
+
+        Returns:
+            Traced ConcreteProgram and executable translated Layer.
+        """
+        # 1. unify args/kwargs and replace Tensor with InputSpec
+        if len(args) != len(self._function_spec.args_name):
+            args, kwargs = self._function_spec.unified_args_and_kwargs(args,
+                                                                       kwargs)
+        input_with_spec = self._function_spec.args_to_input_spec(args, kwargs)
+
+        # 2. generate cache key
+        cache_key = CacheKey(self._function_spec, input_with_spec,
+                             self._class_instance)
+
+        # 3. check whether hit the cache or build a new program for the input arguments
+        concrete_program, partial_program_layer = self._program_cache[cache_key]
+        return concrete_program, partial_program_layer
+
+    def get_traced_count(self):
+        """
+        Returns the number of traced programs for the decorated function.
+        """
+        return len(self._program_cache)
+
+    @property
+    def code(self):
+        """
+        Returns the source code of transformed static function for debugging.
+        """
+        static_func = convert_to_static(self._dygraph_function)
+        source_code = func_to_source_code(static_func)
+        return source_code
+
+    @property
+    def dygraph_function(self):
+        """
+        Returns the original decorated function.
+        """
+        return self._dygraph_function
+
+    @property
+    def concrete_program(self):
+        """
+        Returns recent ConcreteProgram instance of decorated function.
+        """
+        # If `input_spec` is specified, the length of program_cache will always be 1;
+        # otherwise, return the last cached one.
+        cached_program_len = len(self._program_cache)
+        # If `input_spec` is specified, apply conversion from dygraph layers into static Program.
+        if cached_program_len == 0:
+            if len(self._function_spec.flat_input_spec) > 0:
+                input_spec = self._function_spec.input_spec
+                concrete_program, _ = self.get_concrete_program(*input_spec)
+                return concrete_program
             else:
-                feed_layer = input_var
+                raise ValueError("No valid transformed program for {}".format(
+                    self._function_spec))
+        # If more than one programs have been cached, return the recent converted program by default.
+        elif cached_program_len > 1:
+            logging.warning(
+                "Current {} has more than one cached programs: {}, the last traced progam will be return by default.".
+                format(self._function_spec, cached_program_len))
+
+        cache_key, (concrete_program,
+                    partial_layer) = self._program_cache.last()
+        return concrete_program
 
-            inputs.append(feed_layer)
-        # Restores the nested structure as self.args
-        return pack_sequence_as(self.args, inputs)
+    @property
+    def inputs(self):
+        """
+        Returns input tensors of recent converted static program.
+        """
+        concrete_program = self.concrete_program
+        inputs = [
+            var for var in flatten(concrete_program.inputs)
+            if isinstance(var, framework.Variable)
+        ]
+        return inputs
 
     @property
-    def dyfunc(self):
-        return self._dyfunc
+    def outputs(self):
+        """
+        Returns output tensors of recent converted static program.
+        """
+        concrete_program = self.concrete_program
+        outputs = [
+            var for var in flatten(concrete_program.outputs)
+            if isinstance(var, framework.Variable)
+        ]
+
+        return outputs
 
     @property
-    def args(self):
-        return self._args
-
-    def __key(self):
-        # Note: if dygraph function is a method of class,
-        # consider instance info as hash key.
-        if self.is_method():
-            # NOTE: we can use Layer's (instance + function code) as hash key.
-            # An instance will not hold two identical methods 
-            return self._dyfunc_code, self._args[0]
-        else:
-            return self._dyfunc
+    def main_program(self):
+        """
+        Returns recent converted static main program.
+        """
+        concrete_program = self.concrete_program
+        main_program = concrete_program.main_program
+        return main_program
 
-    def __hash__(self):
-        return hash(self.__key())
+    @property
+    def program_cache(self):
+        return self._program_cache
 
-    def __eq__(self, other):
-        return self.__key() == self.__key()
+    @property
+    def function_spec(self):
+        return self._function_spec
 
 
 # Flag that indicates whether running code under `@declarative`
@@ -255,11 +482,17 @@ def _switch_declarative_mode_guard_(is_declarative=True):
 
 
 class ConcreteProgram(object):
+
+    __slots__ = [
+        'inputs', 'outputs', 'main_program', "startup_program", "parameters",
+        "function"
+    ]
+
     def __init__(self,
                  inputs,
                  outputs,
                  parameters,
-                 func,
+                 function,
                  main_program,
                  startup_program=None):
         self.inputs = inputs
@@ -267,17 +500,21 @@ class ConcreteProgram(object):
         self.main_program = main_program
         self.startup_program = startup_program
         self.parameters = parameters
-        self.func_spec = func
+        self.function = function
 
     @staticmethod
     @switch_to_static_graph
-    def from_func_spec(func_spec):
+    def from_func_spec(func_spec, input_spec, class_instance):
         """
         Builds the main_program with specialized inputs and returns outputs
         of program as fetch_list.
+
+        Args:
+            func_spec(FunctionSpec): A FunctionSpec instance for decorated function.
+            input_spec(list[InputSpec]): actual inputs with some arguments replaced by InputSpec.
         """
         # Transforms dygraph function into static function and caches it.
-        dygraph_function = func_spec.dyfunc
+        dygraph_function = func_spec.dygraph_function
         static_func = convert_to_static(dygraph_function)
 
         main_program, startup_program = framework.Program(), framework.Program()
@@ -291,15 +528,20 @@ class ConcreteProgram(object):
         with framework.program_guard(main_program, startup_program):
             with _switch_declarative_mode_guard_(is_declarative=True):
                 # 1. Adds `fluid.data` layers for input if needed
-                inputs = func_spec.to_static_inputs(main_program)
+                inputs = func_spec.to_static_inputs_with_spec(input_spec,
+                                                              main_program)
+                if class_instance:
+                    inputs = tuple([class_instance] + list(inputs))
 
                 # 2. Gets all ParamBases and buffered VarBases in the function
-                all_parameters_and_buffers = list(func_spec.parameters().values(
-                )) + list(func_spec.buffers().values())
+                all_parameters_and_buffers = list(
+                    get_parameters(class_instance).values()) + list(
+                        get_buffers(class_instance).values())
 
                 # 3. Builds program only once and returns the output Variables.
-                with param_guard(func_spec.parameters(False)), param_guard(
-                        func_spec.buffers(False)):
+                with param_guard(get_parameters(
+                        class_instance, False)), param_guard(
+                            get_buffers(class_instance, False)):
                     try:
                         outputs = static_func(*inputs)
                     except BaseException as e:
@@ -317,7 +559,7 @@ class ConcreteProgram(object):
             inputs=inputs,
             outputs=outputs,
             parameters=all_parameters_and_buffers,
-            func=dygraph_function,
+            function=dygraph_function,
             main_program=main_program,
             startup_program=startup_program)
 
@@ -330,27 +572,38 @@ class ProgramCache(object):
     def __init__(self):
         self._caches = collections.OrderedDict()
 
-    def _build_once(self, func_spec):
-        concrete_program = ConcreteProgram.from_func_spec(func_spec)
+    def _build_once(self, cache_key):
+        concrete_program = ConcreteProgram.from_func_spec(
+            func_spec=cache_key.function_spec,
+            input_spec=cache_key.input_with_spec,
+            class_instance=cache_key.class_instance)
         return concrete_program, partial_program_from(concrete_program)
 
     def __getitem__(self, item):
-        if not isinstance(item, FunctionSpec):
-            raise ValueError(
-                'type(item) should be FunctionSpec, but received %s' %
-                type(item))
+        if not isinstance(item, CacheKey):
+            raise ValueError('type(item) should be CacheKey, but received %s' %
+                             type_name(item))
+
         if item not in self._caches:
             self._caches[item] = self._build_once(item)
+            # Note: raise warnings if number of traced program is more than `MAX_TRACED_PROGRAM_COUNT`
+            current_tracing_count = len(self._caches)
+            if current_tracing_count > MAX_TRACED_PROGRAM_COUNT:
+                logging.warning(
+                    "Current traced program number: {} > `max_tracing_count`:{}. Too much cached programs will bring expensive overhead. "
+                    "The reason may be: (1) passing tensors with different shapes, (2) passing python objects instead of tensors.".
+                    format(current_tracing_count, MAX_TRACED_PROGRAM_COUNT))
+
         return self._caches[item]
 
     def get_program(self, item):
-        if not isinstance(item, FunctionSpec):
+        if not isinstance(item, CacheKey):
             raise ValueError(
                 "Input item's type should be FunctionSpec, but received %s" %
-                type(item))
+                type_name(item))
         if item not in self._caches:
             raise RuntimeError(
-                "Failed to find program for input item, please decorate input function by `@declarative`."
+                "Failed to find program for input item, please decorate input function by `@paddle.jit.to_static`."
             )
         return self._caches[item]
 
@@ -360,6 +613,12 @@ class ProgramCache(object):
         key = next(reversed(self._caches.keys()))
         return key, self._caches[key]
 
+    def __len__(self):
+        return len(self._caches)
+
+    def concrete_programs(self):
+        return [cp for cp, _ in self._caches.values()]  # `values()` exists on Python 2 and 3; `iteritems()` is Python 2 only
+
 
 def synchronized(func):
     func.__lock__ = threading.Lock()
@@ -508,9 +767,11 @@ class ProgramTranslator(object):
                 "We will just return dygraph output.")
             return dygraph_func(*args, **kwargs)
 
-        function_spec = FunctionSpec(dygraph_func, args, kwargs)
-        concrete_program, partial_program_layer = self._program_cache[
-            function_spec]
+        function_spec = FunctionSpec(dygraph_func)
+        cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs,
+                                                getattr(dygraph_func,
+                                                        '__self__', None))
+        _, partial_program_layer = self._program_cache[cache_key]
 
         if args and isinstance(args[0], layers.Layer):
             # Synchronize self.training attribute.
@@ -624,8 +885,12 @@ class ProgramTranslator(object):
                 "We will just return dygraph output.")
             return dygraph_func(*args, **kwargs)
 
-        func_spec = FunctionSpec(dygraph_func, args, kwargs)
-        concrete_program, _ = self._program_cache[func_spec]
+        function_spec = FunctionSpec(dygraph_func)
+        cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs,
+                                                getattr(dygraph_func,
+                                                        '__self__', None))
+        concrete_program, partial_program_layer = self._program_cache[cache_key]
+
         # Note: concrete_program hold all input/output infos include non-Variable
         input_vars = [
             var for var in concrete_program.inputs
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
index 21e05bc6faf10110fae385b525e72e38a04da925..ba02a983f8e641079d8a60b166a6f098e6f725a8 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
@@ -18,12 +18,14 @@ import ast
 import astor
 import atexit
 import copy
+import collections
 import gast
 import inspect
 import os
 import six
 import tempfile
 import textwrap
+import numpy as np
 
 from paddle.fluid import unique_name
 
@@ -46,6 +48,77 @@ dygraph_class_to_static_api = {
 FOR_ITER_INDEX_PREFIX = '__for_loop_var_index'
 FOR_ITER_VAR_LEN_PREFIX = '__for_loop_var_len'
 
+# FullArgSpec is only available from Python3. Define a namedtuple
+# to make it available in Python2.
+FullArgSpec = collections.namedtuple('FullArgSpec', [
+    'args', 'varargs', 'varkw', 'defaults', 'kwonlyargs', 'kwonlydefaults',
+    'annotations'
+])
+
+
+def getfullargspec(target):
+    if hasattr(inspect, "getfullargspec"):
+        return inspect.getfullargspec(target)
+    else:
+        argspec = inspect.getargspec(target)
+        return FullArgSpec(
+            args=argspec.args,
+            varargs=argspec.varargs,
+            varkw=argspec.keywords,
+            defaults=argspec.defaults,
+            kwonlyargs=[],
+            kwonlydefaults=None,
+            annotations={})
+
+
+def parse_arg_and_kwargs(function):
+    """
+    Returns full argument names as list. e.g ['x', 'y', 'z']
+    """
+    fullargspec = getfullargspec(function)
+    arg_names = fullargspec.args
+    if arg_names and 'self' == arg_names[0]:
+        arg_names = fullargspec.args[1:]
+
+    # parse default kwargs
+    default_kwargs = {}
+    default_values = fullargspec.defaults
+    if default_values:
+        assert len(default_values) <= len(arg_names)
+        default_kwarg_names = arg_names[-len(default_values):]
+        default_kwargs = dict(zip(default_kwarg_names, default_values))
+
+    return arg_names, default_kwargs
+
+
+def type_name(v):
+    return type(v).__name__
+
+
+def make_hashable(x, error_msg=None):
+    """
+    Returns a hashable stand-in for `x`: list/tuple/dict are converted recursively into tuples
+    (dict keys are NOT included), np.ndarray into the hash of its raw bytes; hashable `x` is returned as is.
+    `error_msg` prefixes the ValueError raised for any other unhashable object, nested ones included.
+    """
+    if isinstance(x, (tuple, list)):
+        return tuple(make_hashable(item, error_msg) for item in x)
+
+    try:
+        hash(x)
+    except TypeError:
+        if isinstance(x, np.ndarray):
+            # Note: `tobytes()` will return the binary data from np.ndarray that
+            # means different value will lead to different hash code.
+            return hash(x.tobytes())
+        elif isinstance(x, dict):
+            return tuple(make_hashable(value, error_msg) for value in x.values())
+
+        error_msg = error_msg or "Requires a hashable object."
+        raise ValueError(error_msg + " But received type: %s" % type_name(x))
+
+    return x
+
 
 def _is_api_in_module_helper(obj, module_prefix):
     m = inspect.getmodule(obj)
diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py
index ba27b2d1c631428c07a5832125cdc18634e9a4b5..7f3d450a49c7d3fcc9ca1d3c2d7c5eb732671c6c 100644
--- a/python/paddle/fluid/dygraph/io.py
+++ b/python/paddle/fluid/dygraph/io.py
@@ -378,7 +378,7 @@ def _load_persistable_vars_by_program(model_path,
             new_var = framework._varbase_creator(
                 type=each_var.type(),
                 name=each_var.name(),
-                shpae=each_var.shape(),
+                shape=each_var.shape(),
                 dtype=each_var.dtype(),
                 persistable=True)
         if params_filename is None:
@@ -636,7 +636,7 @@ class TranslatedLayer(layers.Layer):
             )
         if not isinstance(persistable_vars, dict):
             raise TypeError(
-                "TranslatedLayer need to use persisatbale variable dict for initialization."
+                "TranslatedLayer need to use persistable variable dict for initialization."
             )
 
         self._program_holder_dict = programs
@@ -685,7 +685,7 @@ class TranslatedLayer(layers.Layer):
         # 1. load program desc & construct _ProgramHolder
         programs = _construct_program_holders(model_path, model_filename)
 
-        # 2. load layer parameters & parameter attirbutes
+        # 2. load layer parameters & parameter attributes
         persistable_vars = _construct_params_and_buffers(
             model_path, programs, separate_params, params_filename)
 
@@ -753,7 +753,7 @@ class TranslatedLayer(layers.Layer):
                                          core.VarDesc.VarType.STEP_SCOPES, True)
             tmp_scope_vec.value().set_scope(program_holder.scope)
 
-            # 2. run prorgam by op
+            # 2. run program by op
             trace_program = program_holder.infer_program if self._is_test else program_holder.train_program
             end_op_index = program_holder.infer_program.block(0).op_size()
             framework._dygraph_tracer().trace_op(
@@ -774,7 +774,7 @@ class TranslatedLayer(layers.Layer):
             # will be SelectedRows, not LoDTensor. But tracer will just
             # set param grad VarBase by forward VarBase(LoDTensor)
             # If we don't change grad_var type here, RunProgramOp need
-            # transform SelectedRows to LoDTensor forcely, it may not
+            # transform SelectedRows to LoDTensor forcibly, it may not
             # be user wanted result.
             for persistable_var in persistable_vars:
                 grad_var_name = var.name + core.grad_var_suffix()
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index 5a291df4700ad1336e8e850bd3674b1a8d9df979..853c16a5d0f7129f097f7fca860ab260f9dc7fd5 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -19,12 +19,13 @@ import pickle
 import warnings
 
 import six
+import paddle
 from paddle.fluid import core
 from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
 from paddle.fluid.data_feeder import check_type
 from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph
-from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA
-from paddle.fluid.dygraph.dygraph_to_static.program_translator import FunctionSpec, ProgramTranslator
+from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticLayer, unwrap_decorators
 from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer
 from paddle.fluid.dygraph.layers import Layer
 from paddle.fluid.executor import Executor, scope_guard
@@ -33,7 +34,10 @@ from paddle.fluid.framework import _current_expected_place, _dygraph_guard, _dyg
 from paddle.fluid.framework import dygraph_only, in_dygraph_mode
 from paddle.fluid.wrapped_decorator import wrap_decorator
 
-__all__ = ['TracedLayer', 'declarative', 'dygraph_to_static_func']
+__all__ = [
+    'TracedLayer', 'declarative', 'dygraph_to_static_func', 'set_code_level',
+    'set_verbosity'
+]
 
 
 def create_program_from_desc(program_desc):
@@ -128,7 +132,27 @@ def _dygraph_to_static_func_(dygraph_func):
 dygraph_to_static_func = wrap_decorator(_dygraph_to_static_func_)
 
 
-def _declarative_(dygraph_func):
+def copy_decorator_attrs(original_func, decorated_obj):
+    """
+    Copies some necessary attributes from original function into decorated function.
+
+    Args:
+        original_func(callable): the original decorated function.
+        decorated_obj(StaticLayer): the target decorated StaticLayer object.
+    """
+    decorator_name = "declarative"
+
+    decorated_obj.__name__ = original_func.__name__
+    decorated_obj._decorator_name = decorator_name
+    decorated_obj.__wrapped__ = original_func
+    decorated_obj.__doc__ = original_func.__doc__
+    if hasattr(original_func, "__module__"):
+        decorated_obj.__module__ = original_func.__module__
+
+    return decorated_obj
+
+
+def declarative(function=None, input_spec=None):
     """
     Converts imperative dygraph APIs into declarative function APIs. Decorator
     @declarative handles the Program and Executor of static mode and returns
@@ -138,7 +162,9 @@ def _declarative_(dygraph_func):
     converted into declarative function as well.
 
     Args:
-        dygraph_func (callable): callable imperative function.
+        function (callable): callable imperative function.
+        input_spec(list[InputSpec]): list of InputSpec to specify the shape/dtype/name
+            information of each input Tensor.
 
     Returns:
         Tensor(s): containing the numerical result.
@@ -167,37 +193,27 @@ def _declarative_(dygraph_func):
 
     """
 
-    def __impl__(*args, **kwargs):
-        program_translator = ProgramTranslator()
-        if not program_translator.enable_declarative:
-            warnings.warn(
-                "The decorator 'declarative' doesn't work when setting ProgramTranslator.enable=False. "
-                "We will just return dygraph output.")
-            return dygraph_func(*args, **kwargs)
-        try:
-            return program_translator.get_output(dygraph_func, *args, **kwargs)
-        except Exception as e:
-            error_data = getattr(e, ERROR_DATA, None)
-            if error_data:
-                new_exception = error_data.create_exception()
-                if six.PY3:
-                    # NOTE(liym27):
-                    # 1. Why `raise new_exception from None`?
-                    #   In Python 3, by default, an new exception is raised with trace information of the caught exception.
-                    #   This only raises new_exception and hides unwanted implementation details from tracebacks of the
-                    #   caught exception.
-                    # 2. Use exec to bypass syntax error checking in Python 2.
-
-                    six.exec_("raise new_exception from None")
-                else:
-                    raise new_exception
-            else:
-                raise
+    def decorated(python_func):
+        """
+        Decorates a python function into a StaticLayer object.
+        """
+        # Step 1. unwrap the function if it is already decorated.
+        _, python_func = unwrap_decorators(python_func)
 
-    return __impl__
+        # Step 2. copy some attributes from original python function.
+        static_layer = copy_decorator_attrs(
+            original_func=python_func,
+            decorated_obj=StaticLayer(
+                function=python_func, input_spec=input_spec))
+
+        return static_layer
 
+    # for usage: `declarative(foo, ...)`
+    if function is not None:
+        return decorated(function)
 
-declarative = wrap_decorator(_declarative_)
+    # for usage: `@declarative`
+    return decorated
 
 
 class SaveLoadConfig(object):
@@ -339,7 +355,7 @@ class SaveLoadConfig(object):
                 # use SaveLoadconfig.output_spec
                 model_path = "simplenet.example.model.output_spec"
                 configs = fluid.dygraph.jit.SaveLoadConfig()
-                # only keep the predicted output in saved model, diccard loss
+                # only keep the predicted output in saved model, discard loss
                 configs.output_spec = [out]
 
                 fluid.dygraph.jit.save(
@@ -374,7 +390,7 @@ class SaveLoadConfig(object):
         The name of file to save the translated program of target Layer.
         Default filename is :code:`__model__` .
 
-        Exampels:
+        Examples:
             .. code-block:: python
 
                 import numpy as np
@@ -444,7 +460,7 @@ class SaveLoadConfig(object):
         The name of file to save all persistable variables in target Layer. 
         Default file name is :code:`__variables__` .
         
-        Exampels:
+        Examples:
             .. code-block:: python
 
                 import numpy as np
@@ -597,7 +613,7 @@ def save(layer, model_path, input_spec=None, configs=None):
     The default saved translated program file name is ``__model__``,
     and the default saved persistable variables file name is ``__variables__``,
     and it also saved some additional variable description information to file 
-    ``__varibales.info__``, these additional information is used in fine-tuning.
+    ``__variables.info__``, these additional information is used in fine-tuning.
 
     The saved model can be loaded by follow APIs:
       - :ref:`api_imperative_jit_load`
@@ -607,7 +623,7 @@ def save(layer, model_path, input_spec=None, configs=None):
     Args:
         layer (Layer): the Layer to be saved. The Layer should be decorated by `@declarative`.
         model_path (str): the directory to save the model.
-        input_spec (list[Varibale], optional): Describes the input of the saved model. 
+        input_spec (list[Variable], optional): Describes the input of the saved model. 
             It is the example inputs that will be passed to saved TranslatedLayer's forward
             function. If None, all input variables of the original Layer's forward function
             would be the inputs of the saved model. Default None.
@@ -721,16 +737,17 @@ def save(layer, model_path, input_spec=None, configs=None):
                 "The input input_spec should be 'list', but received input_spec's type is %s."
                 % type(input_spec))
         for var in input_spec:
-            if not isinstance(var, core.VarBase):
+            if not isinstance(var, (core.VarBase, Variable,
+                                    paddle.static.InputSpec)):
                 raise TypeError(
-                    "The element in input_spec list should be 'Variable', but received element's type is %s."
+                    "The element in input_spec list should be 'Variable' or `paddle.static.InputSpec`, but received element's type is %s."
                     % type(var))
 
     # 2. get program of declarative Layer.forward
-    prog_cache = prog_translator.get_program_cache()
-    # make dummy args & kwargs, to get excepted FunctionSpec
-    layer_func = FunctionSpec(type(layer).forward, [layer], {})
-    concrete_program, _ = prog_cache.get_program(layer_func)
+    if not isinstance(layer.forward, StaticLayer):
+        raise RuntimeError(
+            "layer.forward need to be decorated by `@declarative`.")
+    concrete_program = layer.forward.concrete_program
 
     # NOTE: we maintain the mapping of variable name to
     # structured name, the buffer variable (non-persistable)
@@ -814,7 +831,7 @@ def load(model_path, configs=None):
         For some historical reasons, if you load model saved by :ref:`api_fluid_io_save_inference_model`,
         there will be the following limitations when using it in fine-tuning:
         1. Imperative mode do not support LoDTensor. All original model's feed targets or parametars that depend on LoD are temporarily unavailable.
-        2. All saved model's feed targets need to be passed into TranslatedLayer's forwrad function.
+        2. All saved model's feed targets need to be passed into TranslatedLayer's forward function.
         3. The variable's ``stop_gradient`` information is lost and can not be recovered.
         4. The parameter's ``trainable`` information is lost and can not be recovered.
 
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index dc3403358b6af25d5da001282fffe53be8bfd3d9..a14c3a81c121758ed90450cd5eb5990f3f7739e1 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -31,6 +31,7 @@ from ..data_feeder import check_variable_and_dtype, check_type
 import numpy as np
 import numbers
 import logging
+import paddle.utils.deprecated as deprecated
 
 __all__ = [
     'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding',
@@ -2445,6 +2446,10 @@ class BilinearTensorProduct(layers.Layer):
             dtype=self._dtype,
             is_bias=True)
 
+    @deprecated(
+        since="2.0.0",
+        update_to="paddle.nn.Bilinear",
+        reason="New name and new args in Bilinear, easier to use.")
     def forward(self, x, y):
         check_variable_and_dtype(x, 'x', ['float32', 'float64'],
                                  'BilinearTensorProduct')
diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
index 54d2cda4ca6858c46140e1fbf6ac8860c3a7c78d..bd578e6ba98a0f31a952bd5620b90e9464fe8666 100644
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -11,21 +11,26 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import os
 import six
 import numpy as np
+import warnings
 from collections import OrderedDict
-from .. import core
-from . import layers
-from . import parallel_helper
-from .. import framework
-from . import to_variable, no_grad
+
+from paddle.fluid import core
+from paddle.fluid import framework
+from paddle.fluid.dygraph import layers
+from paddle.fluid.dygraph import parallel_helper
+from paddle.fluid.dygraph import to_variable, no_grad
+from paddle.utils import deprecated
 
 __all__ = ["prepare_context", "ParallelEnv", "DataParallel"]
 
 ParallelStrategy = core.ParallelStrategy
 
 
+@deprecated(since="2.0.0", update_to="paddle.distributed.init_parallel_env")
 def prepare_context(strategy=None):
     '''
     :api_attr: imperative
@@ -39,17 +44,18 @@ def prepare_context(strategy=None):
     if strategy.nranks < 2:
         return
     assert framework.in_dygraph_mode() is True, \
-        "dygraph.prepare_context should be used with dygrahp mode."
+        "dygraph.prepare_context should be used with dygraph mode."
     place = framework._current_expected_place()
     assert place is not None, \
         "dygraph.prepare_context should be used in fluid.dygraph.guard(place) guard."
-    if isinstance(place, core.CUDAPlace):
-        parallel_helper._set_parallel_ctx(
-            core.NCCLParallelContext(strategy, place))
-    else:
-        # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation
-        assert ("Only support CUDAPlace for now.")
-    parallel_helper._init_parallel_ctx()
+    if not parallel_helper._is_parallel_ctx_initialized():
+        if isinstance(place, core.CUDAPlace):
+            parallel_helper._set_parallel_ctx(
+                core.NCCLParallelContext(strategy, place))
+        else:
+            # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation
+            assert ("Only support CUDAPlace for now.")
+        parallel_helper._init_parallel_ctx()
     return strategy
 
 
@@ -112,84 +118,84 @@ class ParallelEnv(object):
     """
 
     def __init__(self):
-        self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
-        self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
-        self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        self._rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        self._world_size = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
+        self._device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
         self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
                                             "").split(",")
         self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
 
     @property
-    def nranks(self):
+    def rank(self):
         """
-        The number of trainers, generally refers to the number of GPU cards used in training.
+        Rank of current trainer.
 
-        Its value is equal to the value of the environment variable PADDLE_TRAINERS_NUM. The default value is 1.
+        Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` . The default value is 0.
 
         Examples:
           .. code-block:: python
 
-            # execute this command in terminal: export PADDLE_TRAINERS_NUM=4
-            import paddle.fluid as fluid
+            # execute this command in terminal: export PADDLE_TRAINER_ID=0
+            import paddle.distributed as dist
             
-            env = fluid.dygraph.ParallelEnv()
-            print("The nranks is %d" % env.nranks)
-            # The nranks is 4
+            env = dist.ParallelEnv()
+            print("The rank is %d" % env.rank)
+            # The rank is 0
         """
-        return self._nranks
+        return self._rank
 
     @property
-    def local_rank(self):
+    def world_size(self):
         """
-        The current trainer number.
+        The number of trainers (number of processes participating in current job).
 
-        Its value is equal to the value of the environment variable PADDLE_TRAINER_ID. The default value is 0.
+        Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . The default value is 1.
 
         Examples:
           .. code-block:: python
 
-            # execute this command in terminal: export PADDLE_TRAINER_ID=0
-            import paddle.fluid as fluid
+            # execute this command in terminal: export PADDLE_TRAINERS_NUM=4
+            import paddle.distributed as dist
             
-            env = fluid.dygraph.ParallelEnv()
-            print("The local rank is %d" % env.local_rank)
-            # The local rank is 0
+            env = dist.ParallelEnv()
+            print("The world_size is %d" % env.world_size)
+            # The world_size is 4
         """
-        return self._local_rank
+        return self._world_size
 
     @property
-    def dev_id(self):
+    def device_id(self):
         """
         The ID of selected GPU card for parallel training.
 
-        Its value is equal to the value of the environment variable FLAGS_selected_gpus. The default value is 0.
+        Its value is equal to the value of the environment variable ``FLAGS_selected_gpus`` . The default value is 0.
 
         Examples:
           .. code-block:: python
 
             # execute this command in terminal: export FLAGS_selected_gpus=1
-            import paddle.fluid as fluid
+            import paddle.distributed as dist
             
-            env = fluid.dygraph.ParallelEnv()
-            print("The device id are %d" % env.dev_id)
+            env = dist.ParallelEnv()
+            print("The device id are %d" % env.device_id)
             # The device id are 1
         """
-        return self._dev_id
+        return self._device_id
 
     @property
     def current_endpoint(self):
         """
         The endpoint of current trainer, it is in the form of (node IP + port).
 
-        Its value is equal to the value of the environment variable PADDLE_CURRENT_ENDPOINT. The default value is "".
+        Its value is equal to the value of the environment variable ``PADDLE_CURRENT_ENDPOINT`` . The default value is "".
 
         Examples:
           .. code-block:: python
             
             # execute this command in terminal: export PADDLE_CURRENT_ENDPOINT=127.0.0.1:6170
-            import paddle.fluid as fluid
+            import paddle.distributed as dist
             
-            env = fluid.dygraph.ParallelEnv()
+            env = dist.ParallelEnv()
             print("The current endpoint are %s" % env.current_endpoint)
             # The current endpoint are 127.0.0.1:6170
         """
@@ -201,20 +207,25 @@ class ParallelEnv(object):
         The endpoints of all trainer nodes in the task, 
         which are used to broadcast the NCCL ID when NCCL2 is initialized.
 
-        Its value is equal to the value of the environment variable PADDLE_TRAINER_ENDPOINTS. The default value is "".
+        Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ENDPOINTS`` . The default value is "".
 
         Examples:
           .. code-block:: python
 
             # execute this command in terminal: export PADDLE_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171
-            import paddle.fluid as fluid
+            import paddle.distributed as dist
             
-            env = fluid.dygraph.ParallelEnv()
+            env = dist.ParallelEnv()
             print("The trainer endpoints are %s" % env.trainer_endpoints)
             # The trainer endpoints are ['127.0.0.1:6170', '127.0.0.1:6171']
         """
         return self._trainer_endpoints
 
+    # [aliases] Compatible with old method names
+    local_rank = rank
+    nranks = world_size
+    dev_id = device_id
+
 
 # NOTE: [ Compatible ] Originally this class name is `Env`. The semantics of the old class names
 # are inaccurate and may confuse users, so replace it with `ParallelEnv`, but to be compatible
@@ -227,61 +238,98 @@ class DataParallel(layers.Layer):
     Run the dygraph module with data parallelism.
 
     Currently, DataParallel class only supports to run the dynamic graph
-    with multi-process. The usage is:
-    `python -m paddle.distributed.launch --selected_gpus=0,1 dynamic_graph_test.py`.
-    And the content of `dynamic_graph_test.py` is the code of examples.
+    with multi-process. 
+    
+    Now supports two ways to start training:
+
+    1. start by ``paddle.distributed.spawn`` method, for example:
+
+        ``python demo.py`` (``spawn`` needs to be called in the ``__main__`` block)
+    
+    2. start by ``paddle.distributed.launch`` module, for example:
+    
+        ``python -m paddle.distributed.launch --selected_gpus=0,1 demo.py`` .
+
+    And the content of `demo.py` is the code of examples.
 
     Args:
         layers(Layer): The module that should be executed by data parallel.
-        strategy(ParallelStrategy): The strategy of data parallelism, contains 
-            environment configuration related to parallel execution.
-
+        strategy(ParallelStrategy, optional): (deprecated) The strategy of data parallelism, 
+            contains environment configuration related to parallel execution. Default: None.
+            
     Returns:
         Layer: The data paralleled module.
 
     Examples:
         .. code-block:: python
 
-            import numpy as np
-            import paddle.fluid as fluid
+            import paddle
+            import paddle.nn as nn
+            import paddle.optimizer as opt
+            import paddle.distributed as dist
 
-            place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
-            with fluid.dygraph.guard(place):
-
-                # prepare the data parallel context
-                strategy = fluid.dygraph.prepare_context()
-
-                linear = fluid.dygraph.Linear(1, 10, act="softmax")
-                adam = fluid.optimizer.AdamOptimizer(
-                    learning_rate=0.001, parameter_list=linear.parameters())
-
-                # make the module become the data parallelism module
-                linear = fluid.dygraph.DataParallel(linear, strategy)
-
-                x_data = np.random.random(size=[10, 1]).astype(np.float32)
-                data = fluid.dygraph.to_variable(x_data)
-
-                hidden = linear(data)
-                avg_loss = fluid.layers.mean(hidden)
-
-                # scale the loss according to the number of trainers.
-                avg_loss = linear.scale_loss(avg_loss)
-
-                avg_loss.backward()
-
-                # collect the gradients of trainers.
-                linear.apply_collective_grads()
-
-                adam.minimize(avg_loss)
-                linear.clear_gradients()
+            class LinearNet(nn.Layer):
+                def __init__(self):
+                    super(LinearNet, self).__init__()
+                    self._linear1 = nn.Linear(10, 10)
+                    self._linear2 = nn.Linear(10, 1)
+                    
+                def forward(self, x):
+                    return self._linear2(self._linear1(x))
+
+            def train():
+                # 1. enable dynamic mode
+                paddle.disable_static()
+                
+                # 2. initialize parallel environment
+                dist.init_parallel_env()
+
+                # 3. create data parallel layer & optimizer
+                layer = LinearNet()
+                dp_layer = paddle.DataParallel(layer)
+
+                loss_fn = nn.MSELoss()
+                adam = opt.Adam(
+                    learning_rate=0.001, parameters=dp_layer.parameters())
+
+                # 4. run layer
+                inputs = paddle.randn([10, 10], 'float32')
+                outputs = dp_layer(inputs)
+                labels = paddle.randn([10, 1], 'float32')
+                loss = loss_fn(outputs, labels)
+                
+                loss = dp_layer.scale_loss(loss)
+                loss.backward()
+                dp_layer.apply_collective_grads()
+
+                adam.step()
+                adam.clear_grad()
+
+            if __name__ == '__main__':
+                # 1. start by ``paddle.distributed.spawn`` (default)
+                dist.spawn(train, nprocs=2)
+                # 2. start by ``paddle.distributed.launch``
+                # train()
     """
 
-    def __init__(self, layers, strategy):
+    def __init__(self, layers, strategy=None):
         super(DataParallel,
               self).__init__(layers.full_name() + "_data_parallel")
 
         self._layers = layers
-        self._strategy = strategy
+
+        # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. 
+        # It just stores some environment variables, which can be constructed by 
+        # ParallelEnv. Here it is set as an optional argument.
+        # This parameter is not removed because of compatibility with 1.x writing.
+        if strategy is not None:
+            self._strategy = strategy
+        else:
+            self._strategy = ParallelStrategy()
+            self._strategy.nranks = ParallelEnv().nranks
+            self._strategy.local_rank = ParallelEnv().local_rank
+            self._strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
+            self._strategy.current_endpoint = ParallelEnv().current_endpoint
 
     def forward(self, *inputs, **kwargs):
         return self._layers(*inputs, **kwargs)
diff --git a/python/paddle/fluid/dygraph/parallel_helper.py b/python/paddle/fluid/dygraph/parallel_helper.py
index f378211de2b8a1579ab139318cdc3cb8d5bdc2de..ff1675f0ae8a40b2487d5834b262a1b730641262 100644
--- a/python/paddle/fluid/dygraph/parallel_helper.py
+++ b/python/paddle/fluid/dygraph/parallel_helper.py
@@ -23,6 +23,11 @@ def _is_data_parallel_mode():
         os.getenv("PADDLE_TRAINERS_NUM", "1")) > 1
 
 
+def _is_parallel_ctx_initialized():
+    global __parallel_ctx__clz__
+    return __parallel_ctx__clz__ is not None
+
+
 def _set_parallel_ctx(nccl_parallel_context):
     global __parallel_ctx__clz__
     assert __parallel_ctx__clz__ is None, \
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index 9dbaab2580d21397fa7a4e03b03a5f1c4ac887f2..7cb17843396a6ed79c36126172a253864dbf3d0f 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -15,7 +15,6 @@
 import inspect
 from .. import framework
 from .. import core
-from . import BackwardStrategy
 from ..framework import Variable, Parameter, ParamBase
 from .base import switch_to_static_graph
 import numpy as np
@@ -129,19 +128,18 @@ def monkey_patch_varbase():
                                       framework._current_expected_place())
 
     @framework.dygraph_only
-    def backward(self, backward_strategy=None, retain_graph=False):
+    def backward(self, retain_graph=False):
         """
         **Notes**:
             **This API is ONLY available in Dygraph mode**
 
-        Run backward of current Graph which starts from current Variable
+        Run backward of current Graph which starts from current Tensor.
 
         Args:
-            backward_strategy( :ref:`api_fluid_dygraph_BackwardStrategy` ): The Backward Strategy to run backward
             retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
-            like to add more ops to the built graph after calling this method(`backward`), set the parameter
-            `retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient.
-            Defaults to False.
+                like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
+                :code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient.
+                Defaults to False.
 
         Returns:
             NoneType: None
@@ -149,32 +147,25 @@ def monkey_patch_varbase():
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
                 import numpy as np
+                import paddle
+                paddle.disable_static()
 
                 x = np.ones([2, 2], np.float32)
-                with fluid.dygraph.guard():
-                    inputs2 = []
-                    for _ in range(10):
-                        tmp = fluid.dygraph.base.to_variable(x)
-                        # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since
-                        # there is no one need gradient on it.
-                        tmp.stop_gradient=False
-                        inputs2.append(tmp)
-                    ret2 = fluid.layers.sums(inputs2)
-                    loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
+                inputs = []
+                for _ in range(10):
+                    tmp = paddle.to_tensor(x)
+                    # if we don't set tmp's stop_gradient to False, no path to loss will have a gradient,
+                    # since nothing on that path requires a gradient.
+                    tmp.stop_gradient=False
+                    inputs.append(tmp)
+                ret = paddle.sums(inputs)
+                loss = paddle.reduce_sum(ret)
+                loss.backward()
 
         """
         if framework.in_dygraph_mode():
-            if backward_strategy is None:
-                backward_strategy = BackwardStrategy()
-                backward_strategy.sort_sum_gradient = False
-
-            self._run_backward(backward_strategy,
-                               framework._dygraph_tracer(), retain_graph)
+            self._run_backward(framework._dygraph_tracer(), retain_graph)
         else:
             raise ValueError(
                 "Variable.backward() is only available in DyGraph mode")
@@ -205,9 +196,7 @@ def monkey_patch_varbase():
                         inputs2.append(tmp)
                     ret2 = fluid.layers.sums(inputs2)
                     loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
+                    loss2.backward()
                     print(loss2.gradient())
 
         """
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index ef50294b8e762ae84f9b37f2571458e6588c4bc6..fc4e91aad4fff1db325e17828d26ccd94c164c3d 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1106,15 +1106,18 @@ class Variable(object):
         pass
 
     @fake_interface_only
-    def backward(self, backward_strategy=None):
+    def backward(self, retain_graph=False):
         """
         **Notes**:
             **This API is ONLY available in Dygraph mode**
 
-        Run backward of current Graph which starts from current Variable
+        Run backward of current Graph which starts from current Tensor.
 
         Args:
-            backward_strategy( :ref:`api_fluid_dygraph_BackwardStrategy` ): The Backward Strategy to run backward
+            retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
+                like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
+                :code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient.
+                Defaults to False.
 
         Returns:
             NoneType: None
@@ -1122,23 +1125,21 @@ class Variable(object):
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
                 import numpy as np
+                import paddle
+                paddle.disable_static()
 
                 x = np.ones([2, 2], np.float32)
-                with fluid.dygraph.guard():
-                    inputs2 = []
-                    for _ in range(10):
-                        tmp = fluid.dygraph.base.to_variable(x)
-                        # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since
-                        # there is no one need gradient on it.
-                        tmp.stop_gradient=False
-                        inputs2.append(tmp)
-                    ret2 = fluid.layers.sums(inputs2)
-                    loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
+                inputs = []
+                for _ in range(10):
+                    tmp = paddle.to_tensor(x)
+                    # if we don't set tmp's stop_gradient to False, no path to loss will have a gradient,
+                    # since nothing on that path requires a gradient.
+                    tmp.stop_gradient=False
+                    inputs.append(tmp)
+                ret = paddle.sums(inputs)
+                loss = paddle.reduce_sum(ret)
+                loss.backward()
 
         """
         pass
@@ -1170,9 +1171,7 @@ class Variable(object):
                         inputs2.append(tmp)
                     ret2 = fluid.layers.sums(inputs2)
                     loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
+                    loss2.backward()
                     print(loss2.gradient())
 
                 # example2: return tuple of ndarray
@@ -1218,9 +1217,7 @@ class Variable(object):
                         inputs2.append(tmp)
                     ret2 = fluid.layers.sums(inputs2)
                     loss2 = fluid.layers.reduce_sum(ret2)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = True
-                    loss2.backward(backward_strategy)
+                    loss2.backward()
                     print(loss2.gradient())
                     loss2.clear_gradient()
                     print("After clear {}".format(loss2.gradient()))
diff --git a/python/paddle/fluid/generator.py b/python/paddle/fluid/generator.py
index e11b2e484dce1dd4260b3052d0f0a58f3cfc420a..98924f801413bcd822a0d9a6fd61adcc4d00fddc 100644
--- a/python/paddle/fluid/generator.py
+++ b/python/paddle/fluid/generator.py
@@ -17,44 +17,28 @@ from . import core
 
 __all__ = ['Generator']
 
-default_rng_seed_val = 34342423252
 
-
-class Generator(object):
+class Generator(core.Generator):
     """Generator class"""
 
-    def __init__(self, device="CPU"):
-        """init"""
-        self.device = device
-        seed_in = default_rng_seed_val
-        if self.device == "CPU":
-            self.generator = core.Generator()
-            # self.generator.manual_seed(seed_in)
-        else:
-            raise ValueError(
-                "generator class with device %s does not exist, currently only support generator with device 'CPU' "
-                % device)
-
-    def get_state(self):
-        return self.generator.get_state()
-
-    def set_state(self, state):
-        self.generator.set_state(state)
+    def __init__(self, place=None):
+        """
+        Create a generator object which manages the random number generation. ( Experimental Feature )
 
-    def manual_seed(self, seed):
-        self.generator.manual_seed(seed)
+        Parameters:
+            place(CPUPlace, optional): The place of the generator. Only CPUPlace is supported for now,
+                any other place raises ValueError. Default: None, which is handled as CPUPlace.
 
-    def seed(self):
-        return self.generator.seed()
+        Returns:
+            Generator: A generator object.
 
-    def initial_seed(self):
-        return self.generator.initial_seed()
-
-    def random(self):
-        return self.generator.random()
-
-    def get_cpu_engine(self):
-        return self.generator.get_cpu_engine()
-
-    def set_cpu_engine(self, cpu_engine):
-        self.generator.set_cpu_engine(cpu_engine)
+        """
+        self.place = place  # keeps the caller's value, so it stays None by default
+        if not place:
+            place = core.CPUPlace()
+        if isinstance(place, core.CPUPlace):
+            super(Generator, self).__init__()
+        else:
+            raise ValueError(
+                "Generator class with %s is not supported yet, currently only support generator with CPUPlace"
+                % place)
diff --git a/python/paddle/fluid/incubate/fleet/base/fleet_base.py b/python/paddle/fluid/incubate/fleet/base/fleet_base.py
index f885e51ef7f0d82ca50c7beb6ee6cd443dfc61d4..40cc2d2dd4e3823796451e5f335b7c4e765d5908 100644
--- a/python/paddle/fluid/incubate/fleet/base/fleet_base.py
+++ b/python/paddle/fluid/incubate/fleet/base/fleet_base.py
@@ -145,7 +145,7 @@ class Fleet(object):
 
         Returns:
             bool: True if this is a node of server,
-                  False if not.
+                  False if not
         """
         return self._role_maker.is_server()
 
diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py
index 7f8db694d3601be072ab30ffbbd345b25ffafd80..be27a7c5214e6b4b730d14cb4a64118f24506860 100644
--- a/python/paddle/fluid/incubate/fleet/base/role_maker.py
+++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py
@@ -343,7 +343,6 @@ class MPISymetricRoleMaker(MPIRoleMaker):
     def get_pserver_endpoints(self):
         """
         get pserver endpoints
-        
         Returns:
             endpoints(list): pserver endpoints
         """
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
index 1a7a82fbfac19b41e8b96c231ca74398f6b2214c..236cb458be4c6a07f768761b41464e64d4d53f77 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
@@ -467,7 +467,7 @@ class FleetTranspiler(Fleet):
         opts = public._get_optimize_ops(self._origin_main_program)
         for op in opts:
             if "Param" in op.input_names and \
-                            "LearningRate" in op.input_names and op.input("Param")[0] == param_name:
+                    "LearningRate" in op.input_names and op.input("Param")[0] == param_name:
                 return op
 
     def _save_dense_params(self, executor, dirname, context, main_program):
@@ -700,8 +700,8 @@ if you would like to save all variables in a
                 return False
 
             if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-                            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-                            var.desc.type() == core.VarDesc.VarType.READER:
+                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                    var.desc.type() == core.VarDesc.VarType.READER:
                 return False
             return var.persistable
 
@@ -846,4 +846,4 @@ class ParameterServerOptimizer(DistributedOptimizer):
         fleet.compiled_config = compiled_config
         fleet.main_program, fleet.startup_program = \
             self._build_trainer_programs(compiled_config) if fleet.is_worker() \
-                else self._build_pserver_programs(compiled_config)
+            else self._build_pserver_programs(compiled_config)
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8668e39bd4e2e9724d79352f805aa6e6d68e5c4
--- /dev/null
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import warnings
+
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+
+from paddle.fluid.transpiler.details.program_utils import delete_ops
+from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import find_heter_ops
+from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import create_heter_program
+from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import create_trainer_program
+from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import find_block_joints
+from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import find_op_input_output
+from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import get_vars_name_in_block
+
+
+def split_heter_worker_ops_pass(program, config):
+    """
+    Build the heter-worker program out of the origin program:
+    1. group the ops placed on a non-cpu device into heter blocks
+    2. work out the entrance/exit variables of every block
+    3. emit those blocks, plus a listen_and_serv op, into a new Program
+    """
+    # cpu is the default device; only the "gpu" blocks are emitted for now.
+    cpu_device, worker_device = "cpu", "gpu"
+    program, heter_ops, _, block_ops = find_heter_ops(program, cpu_device)
+    if not heter_ops:
+        # Nothing to split: hand back the program returned by find_heter_ops.
+        warnings.warn(
+            "Currently running in Heter Parameter Server mode, but no OP running on heterogeneous devices, Please check your code."
+        )
+        return program
+
+    if worker_device not in heter_ops:
+        raise ValueError("Op which run on device {} not exist.".format(
+            worker_device))
+
+    joints = find_block_joints(program, block_ops, heter_ops)
+    worker_program = framework.Program()
+    create_heter_program(program, config, worker_program, heter_ops, joints,
+                         worker_device)
+    return worker_program
+
+
+def split_trainer_ops_pass(program, config):
+    """
+    Build the cpu-trainer program out of the origin program:
+    1. group the ops placed on a non-cpu device into heter blocks
+    2. work out the entrance/exit variables of every block
+    3. swap each heter block in the trainer for communication ops
+    """
+    # Todo: support user define default_device (MrChengmo)
+    cpu_device = "cpu"
+    found = find_heter_ops(program, cpu_device)
+    program, heter_ops, _, block_ops = found
+    joints = find_block_joints(program, block_ops, heter_ops)
+    create_trainer_program(program, config, heter_ops, joints)
+    return program
+
+
+def delete_startup_useless_ops_var_pass(startup_program, main_program, config):
+    """
+    Drop, from every non-global startup block, what main_program never uses.
+    """
+    vars_in_main_program = set(
+        get_vars_name_in_block(main_program.global_block()))
+
+    for block_index in range(1, startup_program.num_blocks):
+        current_block = startup_program.block(block_index)
+        # delete useless op: none of its inputs/outputs is used by main_program
+        need_delete_op = []
+        for op in current_block.ops:
+            inputs, outputs = find_op_input_output(startup_program,
+                                                   current_block, op)
+            inputs += outputs
+            # Todo: delete some concat op
+            # The former `list(...) == None` test was never true (a list is not None).
+            if vars_in_main_program.isdisjoint(inputs):
+                need_delete_op.append(op)
+        delete_ops(current_block, need_delete_op)
+
+        # delete useless var: block.vars is keyed by name, so walk a snapshot
+        # of the names and remove through the owning block.
+        for var_name in list(current_block.vars):
+            if var_name not in vars_in_main_program:
+                current_block._remove_var(var_name)
+    return startup_program
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
index 765c18283b49ad956ec34b2c1eefbb4dbcefe85a..05deff10a2e1c914e9725c7d8697a704db6e7e42 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
@@ -37,7 +37,7 @@ LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched
 
 def _is_optimizer_op(op):
     if "Param" in op.input_names and \
-                    "LearningRate" in op.input_names:
+            "LearningRate" in op.input_names:
         return True
     return False
 
@@ -49,7 +49,7 @@ def _same_or_split_var(p_name, var_name):
 def _get_optimizer_input_shape(op_type, varkey, orig_shape, param_shape):
     """
     Returns the shape for optimizer inputs that need to be reshaped when
-    Param and Grad is split to multiple servers.
+    Param and Grad is split to multiple servers. 
     """
     # HACK(typhoonzero) : Should use functions of corresponding optimizer in
     # optimizer.py to get the shape, do not bind this in the transpiler.
@@ -542,7 +542,7 @@ def add_optimizer_pass(program, config):
             for _, op in enumerate(optimize_ops):
                 # optimizer is connected to itself
                 if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name and \
-                                op not in global_ops:
+                        op not in global_ops:
                     __append_optimize_op__(op, per_opt_block, grad_to_block_id,
                                            merged_var, lr_ops)
 
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
index f9889997d9e38c98c4a736a62dbc72da7029f337..378c8fc23d7528766ca9eca062c87a4511e32b46 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
@@ -12,33 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Copyright(c) 2020 PaddlePaddle Authors.All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0(the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http:  // www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 from __future__ import print_function
 from functools import reduce
 
 import collections
 import math
 import os
+import warnings
 
 import six
+import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.core import CommContext
+import paddle.fluid.framework as framework
 from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode
 from paddle.fluid.incubate.fleet.parameter_server.ir import vars_metatools
 from paddle.fluid.incubate.fleet.parameter_server.ir.ps_dispatcher import RoundRobin, PSDispatcher
+from paddle.fluid.transpiler.details.program_utils import delete_ops
 
 OP_NAME_SCOPE = "op_namescope"
 CLIP_OP_NAME_SCOPE = "@CLIP"
@@ -58,8 +48,8 @@ def _get_lr_ops(program):
     for index, op in enumerate(program.global_block().ops):
         role_id = int(op.attr(RPC_OP_ROLE_ATTR_NAME))
         if role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) or \
-                        role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) | \
-                        int(OPT_OP_ROLE_ATTR_VALUE):
+                role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) | \
+                int(OPT_OP_ROLE_ATTR_VALUE):
             lr_ops.append(op)
     return lr_ops
 
@@ -122,9 +112,20 @@ class MergedVariable:
         self.offsets = offsets
 
 
+def Singleton(cls):
+    _created = {}  # cls -> the one instance built so far
+
+    def _singleton(*args, **kwargs):
+        if cls not in _created:  # only the first call constructs
+            _created[cls] = cls(*args, **kwargs)
+        return _created[cls]  # later calls ignore their arguments
+
+    return _singleton
+
+
+@Singleton
 class CompileTimeStrategy(object):
     def __init__(self, main_program, startup_program, strategy, role_maker):
-
         self.min_block_size = 8192
 
         self.origin_main_program = main_program
@@ -177,6 +178,12 @@ class CompileTimeStrategy(object):
     def get_ps_endpoints(self):
         return self.role_maker.get_pserver_endpoints()
 
+    def get_heter_worker_endpoints(self):  # forwarded to the role maker
+        return self.role_maker._get_heter_worker_endpoints()
+
+    def get_heter_worker_endpoint(self):  # forwarded to the role maker
+        return self.role_maker._get_heter_worker_endpoint()
+
     def get_origin_programs(self):
         return self.origin_main_program, self.origin_startup_program
 
@@ -810,6 +817,30 @@ class CompileTimeStrategy(object):
 
         return sparse_param_grads, dense_param_grads
 
+    def remove_var_pair_by_grad(self, var_name):
+        """
+        Forget the pair(s) whose merged grad variable is named `var_name`.
+        """
+
+        def _keeps(pair):
+            # pair is (var, var_grad); only the grad side is compared
+            return pair[1].merged_var.name != var_name
+
+        # Filter in place: deleting while enumerating would skip the pair
+        # that follows each removed one.
+        self.merged_variables_pairs[:] = [
+            pair for pair in self.merged_variables_pairs if _keeps(pair)
+        ]
+
+        # Dense pairs are searched before sparse ones; stop at the first hit.
+        for pairs in (self.merged_dense_pairs, self.merged_sparse_pairs):
+            for index, pair in enumerate(pairs):
+                if not _keeps(pair):
+                    del pairs[index]
+                    return
+
+        print("Not find {} in self.merge_pairs".format(var_name))
+
 
 def _is_opt_role_op(op):
     # NOTE : depend on oprole to find out whether this op is for
@@ -817,7 +848,7 @@ def _is_opt_role_op(op):
     op_maker = core.op_proto_and_checker_maker
     optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
     if op_maker.kOpRoleAttrName() in op.attr_names and \
-                    int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
+            int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
         return True
     return False
 
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
index 912eee0df0a6f9821066dc5c0285ea27c7e52874..201b3863a4b6d6d5fed036d85b2103f5defe61f0 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
@@ -13,7 +13,13 @@
 # limitations under the License.
 
 from __future__ import print_function
+import six
+import collections
+import warnings
+import math
 
+from functools import reduce
+import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.framework as framework
 
@@ -34,6 +40,10 @@ LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched
 OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize
 op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
 
+DEVICE_LIST = ["cpu", "gpu", "xpu"]
+COMMUNICATE_OPS_TYPE = ["send", "recv", "fetch_barrier", "send_barrier"]
+DEFAULT_DEVICE = 'cpu'
+
 
 def delete_optimizer_pass(program, config):
     def _delete_optimizer_op_and_vars(_program, optimize_ops):
@@ -250,7 +260,7 @@ def fake_init_ops_pass(program, config):
         return list(set(dist_varnames + sparse_varnames))
 
     def _fake_init_sparsetable(sparse_table_names):
-        #delete table init op
+        # delete table init op
         for table_name in sparse_table_names:
             table_var = program.global_block().vars[table_name]
             table_param_init_op = []
@@ -307,3 +317,871 @@ def delet_extra_optimizes_pass(program, config):
             program.global_block()._remove_var(var)
 
     return program
+
+
+def find_heter_ops(program, default_device="cpu"):
+    """
+    Partition the global block's ops into consecutive per-device op blocks.
+    Returns (program clone, heter_ops, default_ops, program_block_ops).
+    """
+    if default_device not in DEVICE_LIST:
+        raise ValueError("Given device {} is not in device list {}".format(
+            default_device, DEVICE_LIST))
+
+    def _is_heter_op(op, current_heter_device, default_device="cpu"):
+        heter_devices = [d for d in DEVICE_LIST if d != default_device]
+        op_device = op.attr("op_device")
+        if op_device in heter_devices:
+            return True
+        elif op.type in COMMUNICATE_OPS_TYPE and current_heter_device != default_device:
+            # for distributed communicate ops: send & recv & barrier etc.
+            # Todo: need update this method
+            op._set_attr('op_device', current_heter_device)
+            return True
+        elif op_device is None or op_device == default_device:
+            op._set_attr('op_device', default_device)
+            return False
+        return False
+
+    def _is_same_device(op, pre_device, default_device="cpu"):
+        op_device = op.attr("op_device")
+        if op_device == pre_device:
+            return True
+        if pre_device == default_device:
+            return True
+        return False
+
+    def _append_heter_op(op, current_heter_block_ops, heter_ops):
+        op_device = op.attr("op_device")
+        if op_device not in heter_ops:
+            heter_ops[op_device] = {}
+        current_heter_block_ops.append(op)
+
+    origin_program = program.clone()  # cloned before any op_device is rewritten
+    block = program.global_block()
+
+    program_block_ops = []
+    default_ops = {default_device: {}}
+    heter_ops = {}
+    block_index = 0
+    # heter_ops: {"gpu": {1:[op1, op2, ...], 2:[op1, op2, ...] }; "xpu": {3:[op1, op2, ...], 4:[op1, op2, ...] }}
+
+    current_heter_block_ops = []
+    current_default_block_ops = []
+    current_heter_device = default_device
+    is_heter = False
+    for op in block.ops:
+        if _is_heter_op(op, current_heter_device, default_device):
+            # for gpu/xpu-op
+            is_heter = True
+
+            # for cpu-op block append
+            # NOTE(review): a single pending cpu op is not flushed by `> 1` - confirm intended
+            if len(current_default_block_ops) > 1:
+                default_ops[default_device][
+                    block_index] = current_default_block_ops
+                program_block_ops.append(current_default_block_ops)
+                current_default_block_ops = []
+                block_index += 1
+
+            if _is_same_device(op, current_heter_device, default_device):
+                # for gpu-op, gpu-op -> gpu-op,...
+                current_heter_device = op.attr("op_device")
+                _append_heter_op(op, current_heter_block_ops, heter_ops)
+            else:
+                # for gpu-op -> xpu-op, ...
+                op_device = current_heter_block_ops[0].attr("op_device")
+                heter_ops[op_device][block_index] = current_heter_block_ops
+                program_block_ops.append(current_heter_block_ops)
+                block_index += 1
+                current_heter_block_ops = []
+                current_heter_device = op.attr("op_device")
+                _append_heter_op(op, current_heter_block_ops, heter_ops)
+
+        elif is_heter:
+            # for gpu/xpu-op -> cpu-op
+            op_device = current_heter_block_ops[0].attr("op_device")
+            heter_ops[op_device][block_index] = current_heter_block_ops
+            program_block_ops.append(current_heter_block_ops)
+            block_index += 1
+            current_heter_block_ops = []
+            current_heter_device = default_device
+            is_heter = False
+            current_default_block_ops.append(op)
+        else:
+            # for cpu-op
+            current_default_block_ops.append(op)
+
+    if current_default_block_ops != []:
+        default_ops[default_device][block_index] = current_default_block_ops
+        program_block_ops.append(current_default_block_ops)
+
+    if current_heter_block_ops != []:
+        op_device = current_heter_block_ops[0].attr("op_device")
+        heter_ops[op_device][block_index] = current_heter_block_ops
+        program_block_ops.append(current_heter_block_ops)
+
+    if len(heter_ops) == 0:
+        warnings.warn(
+            "No heterogeneous OP was found in your program , "
+            " please using fluid.device_guard() to run OPs on different device.")
+
+    total_heter_ops = heter_blocks = 0
+    for heter_block_dict in heter_ops.values():
+        heter_blocks += len(heter_block_dict)
+        for _, heter_block in heter_block_dict.items():
+            total_heter_ops += len(heter_block)
+    print(
+        "There are {} OPs in your main_program, and contains {} heter-OPs which is made up of {} heter-blocks.".
+        format(len(block.ops), total_heter_ops, heter_blocks))
+    return origin_program, heter_ops, default_ops, program_block_ops
+
+
+def create_heter_program(program, config, heter_program, heter_ops,
+                         block_var_detail, current_device):
+    """Fill heter_program with one block per op group of current_device, then append listen_and_serv."""
+    optimizer_block = []
+    grad_to_block_id = []
+    send_grad_var_list = []
+
+    pre_block_idx = heter_program.num_blocks - 1  # computed once, shared by every block created below
+    for index, heter_block_ops in heter_ops[current_device].items():
+        heter_block = heter_program._create_block(pre_block_idx)
+        optimizer_block.append(heter_block)
+        for _, op in enumerate(heter_block_ops):
+            block_append_op(heter_program, program, heter_block, op)
+
+            # add related variables: the op's inputs and outputs, looked up in the origin program
+            inputs = _get_input_map_from_op(program.global_block().vars, op)
+            add_vars_by_op_map(inputs, heter_program)
+
+            outputs = _get_output_map_from_op(program.global_block().vars, op)
+            add_vars_by_op_map(outputs, heter_program)
+
+        entrance_vars = block_var_detail[index]["entrance"]
+        add_vars_by_var_list(entrance_vars, program, heter_program)
+        exit_vars = block_var_detail[index]["exit"]
+        add_vars_by_var_list(exit_vars, program, heter_program)
+
+        comm_info = get_communicate_var_info(program, index, entrance_vars,
+                                             exit_vars)
+
+        grad_to_block_id.append(comm_info["block_input_var_name"] + ":" + str(
+            heter_block.idx))
+
+        # create slice op, inserted from index 0 of the block
+        first_op_index = 0
+
+        get_type_var_name = comm_info["input_var_reshape_name"][0].split(
+            ".input_reshape@Heter")[0]
+        get_type_var = heter_program.global_block().vars[get_type_var_name]  # its dtype/type are reused below
+
+        insert_recv_slice_op(
+            heter_program, heter_block, first_op_index,
+            comm_info["block_input_var_name"],
+            (-1, sum(comm_info["input_var_reshape_dim"])), get_type_var.dtype,
+            get_type_var.type, comm_info["input_var_reshape_name"], [
+                (-1, comm_info["input_var_reshape_dim"][i])
+                for i in range(len(comm_info["input_var_reshape_dim"]))
+            ])
+        first_op_index += len(comm_info["input_var_reshape_dim"])
+        # create reshape op: one per reshaped input, writing into the matching entrance var
+        for i in range(len(comm_info["input_var_reshape_name"])):
+            var_name = entrance_vars[i]
+            insert_reshape_op(
+                heter_program,
+                heter_block,
+                first_op_index,
+                comm_info["input_var_reshape_name"][i],
+                var_name, )
+            first_op_index += 1
+
+        first_op_index = len(heter_block.ops)  # continue after the ops already in the block
+
+        # create send reshape op: one per exit var
+        for i in range(len(exit_vars)):
+            insert_reshape_op(heter_program, heter_block, first_op_index,
+                              exit_vars[i],
+                              comm_info["output_var_reshape_name"][i],
+                              [-1, comm_info["output_var_reshape_dim"][i]])
+            first_op_index += 1
+
+        # create send concat op
+        insert_send_concat_op(heter_program, heter_block, first_op_index,
+                              comm_info["output_var_reshape_name"],
+                              comm_info["block_output_var_name"],
+                              [-1, sum(comm_info["output_var_reshape_dim"])])
+        check_op_device(heter_block, current_device)
+        send_grad_var_list = send_grad_var_list + add_heter_send_op(
+            program, heter_program, heter_block, block_var_detail[index])
+
+    # add step counter: a send op on the last created block, with STEP_COUNTER as its only send_varname
+    send_input_vars = []
+    dummy_output = []
+    trainer_id = config.get_role_id()  # NOTE(review): not used below
+    pserver_endpoints = config.get_ps_endpoints()
+    optimizer_block[-1].append_op(
+        type="send",
+        inputs={"X": send_input_vars},
+        outputs={"Out": dummy_output},
+        attrs={
+            "send_varnames": [STEP_COUNTER],
+            "merge_add": True,
+            "use_send_handler": False,
+            "endpoints": pserver_endpoints
+        })
+
+    # add info in listen&serv
+    attrs = {
+        "grad_to_block_id": grad_to_block_id,
+        "sparse_grad_to_param": None,
+        "lr_decay_block_id": None,
+        "dense_optimize_blocks": None,
+        "sparse_optimize_blocks": None,
+        "optimize_blocks": optimizer_block,
+
+        # runtime attribute
+        "endpoint": config.get_heter_worker_endpoint(),
+        "pserver_id": config.get_role_id(),
+        "Fanin": config.get_trainers(),
+        "distributed_mode": config.get_distributed_mode(),
+        "rpc_get_thread_num": 12,
+        "rpc_send_thread_num": 12,
+        "rpc_prefetch_thread_num": 12
+    }
+
+    # append the listen_and_serv op
+    heter_program.global_block().append_op(
+        type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=attrs)
+
+    check_heter_compile_time_strategy(program, config, send_grad_var_list)
+
+
+def check_heter_compile_time_strategy(program, config, send_grad_var_list):
+    """Drop from config every merged grad pair that is not in send_grad_var_list."""
+    # names of the merged grads config currently tracks
+    tracked_grads = {
+        grad.merged_var.name for _, grad in config.merged_variables_pairs
+    }
+    sent_grads = set(send_grad_var_list)
+
+    # whatever is tracked but never sent is removed from config
+    useless_grads = list(tracked_grads - sent_grads)
+    for grad_name in useless_grads:
+        config.remove_var_pair_by_grad(grad_name)
+
+
+def create_trainer_program(program, config, heter_ops, block_var_detail):
+    """Swap every heter block of the trainer program for communicate ops, then clean up."""
+    for device_blocks in heter_ops.values():
+        # blocks of one device are handled in ascending block index
+        for idx in sorted(device_blocks):
+            replace_ops_by_communicate_op(program, config, idx,
+                                          device_blocks[idx], block_var_detail)
+            remove_trainer_send_op(program, config, idx, block_var_detail)
+    deleter_trainer_useless_var(program)
+    check_op_device(program.global_block(), DEFAULT_DEVICE)
+
+
+def replace_ops_by_communicate_op(program, config, heter_block_index, ops_list,
+                                  block_var_detail):
+    all_op = program.global_block().ops
+    start_op = ops_list[0]  # the heter block is located through its first op
+    first_op_idx = -1
+    for op in all_op:
+        if is_same_op(op, start_op):
+            first_op_idx = all_op.index(op)
+            break
+    assert first_op_idx != -1
+    delete_same_ops(program.global_block(), ops_list)
+
+    mode = config.get_distributed_mode()  # NOTE(review): not used below
+    heter_worker_endpoint = config.get_heter_worker_endpoint()
+    entrance_var = block_var_detail[heter_block_index]["entrance"]
+    exit_var = block_var_detail[heter_block_index]["exit"]
+
+    # NOTE(review): reads the detail at heter_block_index - 1; confirm the index is never 0
+    default_device_comm_info = get_communicate_var_info(
+        program, heter_block_index - 1,
+        block_var_detail[heter_block_index - 1]["entrance"],
+        block_var_detail[heter_block_index - 1]["exit"])
+    comm_info = get_communicate_var_info(program, heter_block_index,
+                                         entrance_var, exit_var)
+
+    # create reshape op: one per entrance var, inserted where the removed ops started
+    for i in range(len(entrance_var)):
+        insert_reshape_op(
+            program,
+            program.global_block(), first_op_idx, entrance_var[i],
+            default_device_comm_info["output_var_reshape_name"][i],
+            [-1, default_device_comm_info["output_var_reshape_dim"][i]])
+        first_op_idx += 1
+
+    # create concat op: writes the previous block's block_output_var_name
+    insert_send_concat_op(
+        program,
+        program.global_block(), first_op_idx,
+        default_device_comm_info["output_var_reshape_name"],
+        default_device_comm_info["block_output_var_name"],
+        [-1, sum(default_device_comm_info["output_var_reshape_dim"])])
+    first_op_idx += 1
+
+    # create send op: its input is the var written by the concat above
+    send_input_vars = [
+        program.global_block().vars[default_device_comm_info[
+            "block_output_var_name"]]
+    ]
+
+    get_type_var_name = comm_info["output_var_reshape_name"][0].split(
+        ".output_reshape@Heter")[0]
+    get_type_var = program.global_block().vars[get_type_var_name]  # its dtype/type are reused below
+
+    program.global_block().create_var(
+        name=comm_info["block_output_var_name"],
+        shape=(-1, sum(comm_info["output_var_reshape_dim"])),
+        dtype=get_type_var.dtype,
+        type=get_type_var.type)
+
+    recv_vars = [
+        program.global_block().vars[comm_info["block_output_var_name"]]
+    ]
+
+    program.global_block()._insert_op(
+        index=first_op_idx,
+        type="send_and_recv",
+        inputs={"X": send_input_vars},
+        outputs={"Out": recv_vars},
+        attrs={
+            "send_var_name": default_device_comm_info["block_output_var_name"],
+            "recv_var_name": comm_info["block_output_var_name"],
+            "endpoint": heter_worker_endpoint,
+            "trainer_id": config.get_role_id(),
+            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+        })
+    first_op_idx += 1
+
+    # recv side: handle the var produced by send_and_recv
+    # create slice op
+    insert_recv_slice_op(
+        program,
+        program.global_block(), first_op_idx,
+        comm_info["block_output_var_name"],
+        (-1, sum(comm_info["output_var_reshape_dim"])), get_type_var.dtype,
+        get_type_var.type, comm_info["output_var_reshape_name"], [
+            (-1, comm_info["output_var_reshape_dim"][i])
+            for i in range(len(comm_info["output_var_reshape_dim"]))
+        ])
+
+    first_op_idx += len(comm_info["output_var_reshape_dim"])
+
+    # create reshape op: one per reshaped output, target name is the part before the suffix
+    for i in range(len(comm_info["output_var_reshape_name"])):
+        var_name = comm_info["output_var_reshape_name"][i].split(
+            ".output_reshape@Heter")[0]
+        insert_reshape_op(
+            program,
+            program.global_block(),
+            first_op_idx,
+            comm_info["output_var_reshape_name"][i],
+            var_name, )
+        first_op_idx += 1
+
+
+def remove_trainer_send_op(program, config, heter_block_index,
+                           block_var_detaile):
+    # If the trainer runs FF->BP->SEND, it holds both vars: var and var@GRAD.
+    # If the trainer only runs SEND, it holds a single var: var@GRAD.
+    # Delete the send op if the trainer lacks the var pair (var<->var@GRAD).
+    persistables = block_var_detaile[heter_block_index]["persistables"]
+    need_remove_send_op = []
+    need_remove_grad_var = []
+    for op in find_send_op(program):
+        input_list, _ = find_op_input_output(program,
+                                             program.global_block(), op)
+        for var_name in input_list:
+            origin_var_name = var_name.split("@GRAD")[0]
+            if origin_var_name in persistables:
+                need_remove_send_op.append(op)
+                need_remove_grad_var.append(var_name)
+    need_remove_send_op = list(set(need_remove_send_op))
+    delete_ops(program.global_block(), need_remove_send_op)
+    for grad_var_name in need_remove_grad_var:
+        config.remove_var_pair_by_grad(grad_var_name)
+
+
+def add_heter_send_op(program, heter_program, block, block_var_detail):
+    def _get_send_op_dict():
+        send_op_dict = {}
+        send_op_list = find_send_op(program)
+        for op in send_op_list:
+            input_list, _ = find_op_input_output(program,
+                                                 program.global_block(), op)
+            for var in input_list:
+                send_op_dict[var] = op
+        return send_op_dict
+
+    send_grad_var_list = []
+    send_op_dict = _get_send_op_dict()
+    for persistable_var in block_var_detail["persistables"]:
+        # check var_name ==  var@GRAD
+        if "@GRAD" not in persistable_var:
+            continue
+        if "GRAD" != persistable_var.split("@")[-1]:
+            continue
+        if persistable_var not in send_op_dict:
+            continue
+        block_append_op(program, heter_program, block,
+                        send_op_dict[persistable_var])
+        send_grad_var_list.append(persistable_var)
+    return send_grad_var_list
+
+
+def find_send_op(program):
+    """Return every op of type "send" found in the global block of
+    ``program``, in block order."""
+    return [
+        op for op in program.global_block().ops if op.type == "send"
+    ]
+
+
+def get_communicate_var_info(program, block_index, entrance_var_list,
+                             exit_var_list):
+    input_var_reshape_dim = []
+    input_var_reshape_name = []
+    block_input_var_name = "joint_{}_{}@Heter".format(block_index - 1,
+                                                      block_index)
+    output_var_reshape_dim = []
+    output_var_reshape_name = []
+    block_output_var_name = "joint_{}_{}@Heter".format(block_index,
+                                                       block_index + 1)
+    entrance_var_list.sort()
+    exit_var_list.sort()
+    # input
+    # Heter_SERVER_BLOCK_index@JOINT_VAR -> slice -> var@Heter_SERVER_BLOCK@INPUT_RESHAPE_VAR -> reshape -> var
+    for name in entrance_var_list:
+        var = program.global_block().vars[name]
+        shape = var.shape
+        if len(shape) < 2 or shape[0] != -1:
+            raise ValueError(
+                "Variable {} not support heter training. its shape is {}".
+                format(name, shape))
+        recv_var_dim = -1 * reduce(lambda x, y: x * y, shape)
+        input_var_reshape_dim.append(recv_var_dim)
+        input_var_reshape_name.append("{}.input_reshape@Heter".format(name))
+
+    # output
+    # var -> reshape -> var@Heter_SERVER_BLOCK@INPUT_RESHAPE_VAR -> concat -> Heter_SERVER_BLOCK_index@JOINT_VAR
+    for var_name in exit_var_list:
+        var = program.global_block().vars[var_name]
+        shape = var.shape
+        if len(shape) < 2 or shape[0] != -1:
+            raise ValueError(
+                "Variable {} not support heter training. its shape is {}".
+                format(var_name, shape))
+        send_reshape_dim = -1 * reduce(lambda x, y: x * y, shape)
+        output_var_reshape_dim.append(send_reshape_dim)
+        output_var_reshape_name.append("{}.output_reshape@Heter".format(
+            var_name))
+
+    info = {
+        "input_var_reshape_dim": input_var_reshape_dim,
+        "input_var_reshape_name": input_var_reshape_name,
+        "block_input_var_name": block_input_var_name,
+        "output_var_reshape_dim": output_var_reshape_dim,
+        "output_var_reshape_name": output_var_reshape_name,
+        "block_output_var_name": block_output_var_name
+    }
+
+    return info
+
+
+def find_block_joints(program, program_block_ops_list, heter_ops):
+    block_var_detail = find_entrance_exit_private(program,
+                                                  program_block_ops_list)
+    block_var_detail = entrance_exit_check(program, program_block_ops_list,
+                                           block_var_detail, heter_ops)
+    block_var_detail = delete_block_useless_exit(
+        program, program_block_ops_list, block_var_detail)
+    return block_var_detail
+
+
+def find_entrance_exit_private(program, program_block_ops_list):
+    block_var_detail = []
+    persistables = []
+    for index, block_op_list in enumerate(program_block_ops_list):
+        block_input, block_output = find_ops_list_input_output(program,
+                                                               block_op_list)
+        persistables = screen_persistables(
+            program, block_input) + screen_persistables(program, block_output)
+        # find entrance & exit
+        block_private_vars = list(set(block_input) & set(block_output))
+        block_entrance = list(set(block_input) - set(block_private_vars))
+        block_exit = list(set(block_output) - set(block_private_vars))
+        detail = {
+            "entrance": block_entrance,
+            "exit": block_exit,
+            "private": block_private_vars,
+            "persistables": persistables
+        }
+        block_var_detail.append(detail)
+    return block_var_detail
+
+
+def entrance_exit_check(program, program_block_ops_list, block_var_detail,
+                        heter_ops):
+    for index in range(len(block_var_detail) - 1, -1, -1):
+        if index - 1 < 0:
+            break
+        previous_block_exit = block_var_detail[index - 1]["exit"]
+        previous_block_exit.sort()
+        current_block_entrance = block_var_detail[index]["entrance"]
+        current_block_entrance.sort()
+        if previous_block_exit == current_block_entrance:
+            continue
+        exist_vars = list(
+            set(previous_block_exit) & set(current_block_entrance))
+        need_add_vars = list(set(current_block_entrance) - set(exist_vars))
+        need_add_vars = find_need_var_from_previous_block(
+            need_add_vars, block_var_detail, index, heter_ops)
+
+        previous_block_private = block_var_detail[index - 1]["private"]
+        previous_block_entrance = block_var_detail[index - 1]["entrance"]
+        for var in need_add_vars:
+            if var not in previous_block_private and var not in previous_block_entrance:
+                previous_block_entrance.append(var)
+            previous_block_exit.append(var)
+    return block_var_detail
+
+
+def find_need_var_from_previous_block(need_add_vars, block_var_detail,
+                                      current_index, heter_ops):
+    # Return need_add_vars minus the vars that need no communication: a var is
+    # dropped when an earlier block holding it (private/exit/entrance) and the
+    # current block are both placed on DEFAULT_DEVICE.
+    # Map each block index to its device; heter_ops overrides the default.
+    index_device_map = {}
+    for index in range(len(block_var_detail)):
+        index_device_map[index] = DEFAULT_DEVICE
+    for device in heter_ops:
+        for index in heter_ops[device].keys():
+            index_device_map[index] = device
+
+    need_ignore_var = []
+    for var in need_add_vars:
+        # Restart the backward scan for every var: a cursor shared across vars
+        # would make later vars skip the blocks already visited before them.
+        pre_index = current_index - 1
+        while pre_index >= 0:
+            detail = block_var_detail[pre_index]
+            total_var = detail["private"] + detail["exit"] + detail["entrance"]
+            if var in total_var:
+                cur_device = index_device_map[current_index]
+                if cur_device == index_device_map[pre_index] == DEFAULT_DEVICE:
+                    need_ignore_var.append(var)
+                    break
+            pre_index -= 1
+
+    need_add_vars = list(set(need_add_vars).difference(set(need_ignore_var)))
+    return need_add_vars
+
+
+def delete_block_useless_exit(program, program_block_ops_list,
+                              block_var_detail):
+    for index in range(len(block_var_detail)):
+        if index == len(block_var_detail) - 1:
+            break
+        current_block_exit = block_var_detail[index]["exit"]
+        next_block_entrance = block_var_detail[index + 1]["entrance"]
+        need_delete_var = []
+        for var in current_block_exit:
+            if var not in next_block_entrance:
+                need_delete_var.append(var)
+
+        for var in need_delete_var:
+            current_block_exit.remove(var)
+
+    return block_var_detail
+
+
+def check_op_device(block, device):
+    for op in block.ops:
+        op._set_attr('op_device', device)
+
+
+def screen_persistables(program, var_list):
+    need_remove = []
+    for var_name in var_list:
+        if "@GRAD" in var_name:
+            origin_var_name = var_name.split("@GRAD")[0]
+            var = program.global_block().vars[origin_var_name]
+        else:
+            var = program.global_block().vars[var_name]
+
+        if fluid.io.is_persistable(var):
+            need_remove.append(var_name)
+
+    for var_name in need_remove:
+        var_list.remove(var_name)
+    return need_remove
+
+
+def insert_reshape_op(program,
+                      block,
+                      index,
+                      var_name,
+                      new_var_name,
+                      new_var_shape=None):
+    input_var = program.global_block().vars[var_name]
+
+    if new_var_name not in program.global_block().vars:
+        out = program.global_block().create_var(
+            name=new_var_name,
+            shape=new_var_shape,
+            dtype=input_var.dtype,
+            type=input_var.type)
+    else:
+        out = program.global_block().vars[new_var_name]
+        new_var_shape = out.shape
+
+    x_shape = program.global_block().create_var(
+        name="{}.xshape@Heter".format(var_name), dtype=input_var.dtype)
+    block._insert_op(
+        index=index,
+        type="reshape2",
+        inputs={"X": input_var},
+        attrs={'shape': new_var_shape},
+        outputs={"Out": out,
+                 "XShape": x_shape})
+
+
+def insert_send_concat_op(program, block, index, var_name_list, new_var_name,
+                          new_var_shape):
+    input_var_list = [
+        program.global_block().vars[var_name] for var_name in var_name_list
+    ]
+
+    out = program.global_block().create_var(
+        name=new_var_name,
+        shape=new_var_shape,
+        dtype=input_var_list[0].dtype,
+        type=input_var_list[0].type)
+
+    block._insert_op(
+        index=index,
+        type='concat',
+        inputs={"X": input_var_list},
+        outputs={'Out': [out]},
+        attrs={'axis': -1,
+               'use_stack': False})
+
+
+def insert_recv_slice_op(program, block, index, var_name, var_shape, dtype,
+                         type, new_var_name_list, new_var_shape_list):
+
+    if var_name not in program.global_block().vars:
+        input_var = program.global_block().create_var(
+            name=var_name, shape=var_shape, dtype=dtype, type=type)
+    else:
+        input_var = program.global_block().vars[var_name]
+
+    out_list = []
+    for i in range(len(new_var_name_list)):
+        if new_var_name_list[i] not in program.global_block().vars:
+            out = program.global_block().create_var(
+                name=new_var_name_list[i],
+                shape=new_var_shape_list[i],
+                dtype=input_var.dtype,
+                type=input_var.type)
+        else:
+            out = program.global_block().vars[new_var_name_list[i]]
+        out_list.append(out)
+
+    start_index = 0
+    end_index = 0
+    for i in range(len(new_var_name_list)):
+        starts = []
+        ends = []
+        attrs = {'axes': [1]}
+        end_index += new_var_shape_list[i][1]
+        starts.append(start_index)
+        ends.append(end_index)
+        attrs['starts'] = starts
+        attrs['ends'] = ends
+
+        block._insert_op(
+            index=index,
+            type='slice',
+            inputs={'Input': input_var},
+            attrs=attrs,
+            outputs={'Out': out_list[i]})
+        start_index = end_index
+        index += 1
+
+
+def deleter_trainer_useless_var(program):
+    # Remove from the global block every var that no op of that block reads or
+    # writes, and return the list of removed var names.
+    program_useful_vars = set()
+    for op in program.global_block().ops:
+        input_var_list, output_var_list = find_op_input_output(
+            program, program.global_block(), op)
+        program_useful_vars.update(input_var_list, output_var_list)
+
+    program_useless_var_list = list(
+        set(get_vars_name_in_block(program.global_block())).difference(
+            program_useful_vars))
+    for var in program_useless_var_list:
+        program.global_block()._remove_var(var)
+    return program_useless_var_list
+
+
+def block_append_op(program, origin_program, block, op):
+    inputs = _get_input_map_from_op(origin_program.global_block().vars, op)
+    for key, varlist in six.iteritems(inputs):
+        if not isinstance(varlist, list):
+            varlist = [varlist]
+        for var in varlist:
+            if var.name not in program.global_block().vars:
+                program.global_block()._clone_variable(var)
+
+    outputs = _get_output_map_from_op(origin_program.global_block().vars, op)
+    for key, varlist in six.iteritems(outputs):
+        if not isinstance(varlist, list):
+            varlist = [varlist]
+        for var in varlist:
+            if var.name not in program.global_block().vars:
+                program.global_block()._clone_variable(var)
+
+    if "_grad" not in op.type:
+        # forward op: append it through block.append_op and return the new op
+        return block.append_op(
+            type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs())
+    else:
+        # grad op: copy the op desc directly; this branch returns None
+        op_desc = op.desc
+        op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
+        backward = core.op_proto_and_checker_maker.OpRole.Backward
+        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
+
+        # append the grad op desc and mark its role as Backward
+        new_op_desc = block.desc.append_op()
+        new_op_desc.copy_from(op_desc)
+        new_op_desc._set_attr(op_role_attr_name, backward)
+
+        # carry over the op device attribute when the source op has one
+        if op.desc.has_attr(device_attr_name):
+            op_device = op_desc.attr(device_attr_name)
+            new_op_desc._set_attr(device_attr_name, op_device)
+        block._sync_with_cpp()
+
+
+def add_vars_by_op_map(var_map, program):
+    for key, varlist in six.iteritems(var_map):
+        if not isinstance(varlist, list):
+            varlist = [varlist]
+        for i in range(len(varlist)):
+            var = varlist[i]
+            if var.name not in program.global_block().vars:
+                program.global_block()._clone_variable(var)
+
+
+def add_vars_by_var_list(var_name_list, origin_program, program):
+    for var_name in var_name_list:
+        if var_name not in program.global_block().vars:
+            var = origin_program.global_block().vars[var_name]
+            program.global_block()._clone_variable(var)
+
+
+def get_varlist_from_op_map(var_map):
+    var_list = []
+    for key, varlist in six.iteritems(var_map):
+        if not isinstance(varlist, list):
+            varlist = [varlist]
+        for i in range(len(varlist)):
+            var = varlist[i]
+            var_list.append(var.name)
+    return var_list
+
+
+def find_ops_list_input_output(program, ops_list):
+    input_var_list = []
+    output_var_list = []
+    for op in ops_list:
+        inputs = _get_input_map_from_op(program.global_block().vars, op)
+        input_var_list += get_varlist_from_op_map(inputs)
+        outputs = _get_output_map_from_op(program.global_block().vars, op)
+        output_var_list += get_varlist_from_op_map(outputs)
+
+    input_var_list = list(set(input_var_list))
+    output_var_list = list(set(output_var_list))
+    return input_var_list, output_var_list
+
+
+def find_op_input_output(program, block, op):
+    input_var_list = []
+    output_var_list = []
+    inputs = _get_input_map_from_op(block.vars, op)
+    input_var_list += get_varlist_from_op_map(inputs)
+    outputs = _get_output_map_from_op(block.vars, op)
+    output_var_list += get_varlist_from_op_map(outputs)
+    input_var_list = list(set(input_var_list))
+    output_var_list = list(set(output_var_list))
+    return input_var_list, output_var_list
+
+
+def get_vars_name_in_block(block):
+    # Copy the keys of block.vars into a new list of var names.
+    vars_name_list = list(block.vars.keys())
+    return vars_name_list
+
+
+def is_same_op(op1, op2):
+    # Two ops count as the same when their string forms are identical.
+    same = str(op1) == str(op2)
+    return same
+
+
+def _get_input_map_from_op(varmap, op):
+    """Returns a dict from op input name to the vars in varmap."""
+    iomap = collections.OrderedDict()
+    for key in op.input_names:
+        vars = []
+        for varname in op.input(key):
+            if varname == "@EMPTY@":
+                continue
+            if "lod_tensor_blocking_queue" in varname:
+                continue
+            vars.append(varmap[varname])
+        if len(vars) == 1:
+            iomap[key] = vars[0]
+        else:
+            iomap[key] = vars
+    return iomap
+
+
+def _get_output_map_from_op(varmap, op):
+    """Returns a dict from op output name to the vars in varmap."""
+    iomap = collections.OrderedDict()
+    for key in op.output_names:
+        vars = []
+        for varname in op.output(key):
+            if varname == "@EMPTY@":
+                continue
+            if "lod_tensor_blocking_queue" in varname:
+                continue
+            vars.append(varmap[varname])
+        if len(vars) == 1:
+            iomap[key] = vars[0]
+        else:
+            iomap[key] = vars
+    return iomap
+
+
+def delete_same_ops(block, ops):
+    # For each op, remove the first op of `block` that is_same_op matches.
+    for op in ops:
+        try:
+            for idx, origin_op in enumerate(block.ops):
+                if is_same_op(origin_op, op):
+                    block._remove_op(idx)
+                    break
+        except Exception as e:
+            print(e)
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index da66503ceee37e30fafa0d5402edd2a188578a0b..39c4df00657daccb88ae1ad95781891c4c6ec11e 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1858,6 +1858,7 @@ def conv3d(input,
     return helper.append_activation(pre_act)
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.pool2d")
 @templatedoc()
 def pool2d(input,
            pool_size=-1,
@@ -2075,6 +2076,7 @@ def pool2d(input,
     return pool_out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.pool3d")
 @templatedoc()
 def pool3d(input,
            pool_size=-1,
@@ -2303,6 +2305,7 @@ def pool3d(input,
     return pool_out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.adaptive_pool2d")
 @templatedoc(op_type="pool2d")
 def adaptive_pool2d(input,
                     pool_size,
@@ -2450,6 +2453,7 @@ def adaptive_pool2d(input,
     return (pool_out, mask) if require_index else pool_out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.adaptive_pool3d")
 @templatedoc(op_type="pool3d")
 def adaptive_pool3d(input,
                     pool_size,
@@ -10205,6 +10209,7 @@ def unstack(x, axis=0, num=None):
     return outs
 
 
+@deprecated(since='2.0.0', update_to="paddle.expand")
 def expand(x, expand_times, name=None):
     """
     :alias_main: paddle.expand
@@ -10312,6 +10317,7 @@ def expand(x, expand_times, name=None):
     return out
 
 
+@deprecated(since='2.0.0', update_to="paddle.expand_as")
 def expand_as(x, target_tensor, name=None):
     """
     :alias_main: paddle.expand_as
@@ -10377,6 +10383,9 @@ def expand_as(x, target_tensor, name=None):
         #(3,20)
 
     """
+    if in_dygraph_mode():
+        return core.ops.expand_as(x, target_tensor)
+
     check_variable_and_dtype(
         x, 'x', ['float32', 'float64', 'int32', 'int64', 'bool'], 'expand_as')
     check_variable_and_dtype(target_tensor, 'target_tensor',
@@ -12086,6 +12095,13 @@ Examples:
 
 
 def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
+    if in_dygraph_mode():
+        op = getattr(core.ops, op_name)
+        if binary_op:
+            return op(x, y)
+        else:
+            return op(x)
+
     check_variable_and_dtype(x, "x", ["bool"], op_name)
     if y is not None:
         check_variable_and_dtype(y, "y", ["bool"], op_name)
@@ -12110,28 +12126,27 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
     return out
 
 
-@templatedoc()
 def logical_and(x, y, out=None, name=None):
     """
-    :alias_main: paddle.logical_and
-    :alias: paddle.logical_and, paddle.tensor.logical_and, paddle.tensor.logic.logical_and
-    :old_api: paddle.fluid.layers.logical_and
 
-    ``logical_and`` operator computes element-wise logical AND on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``.
+    ``logical_and`` operator computes element-wise logical AND on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Tensor``.
     Each element of ``out`` is calculated by
 
     .. math::
 
         out = x \&\& y
 
+    .. note::
+        ``paddle.logical_and`` supports broadcasting. If you want to know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
+
     Args:
-        x(${x_type}): ${x_comment}.
-        y(${y_type}): ${y_comment}.
-        out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output.
-        name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`.
+        x (Tensor): the input tensor, its data type should be bool.
+        y (Tensor): the input tensor, its data type should be bool.
+        out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        ${out_type}: ${out_comment}
+        N-D Tensor. A location into which the result is stored. Its shape is the broadcast shape of ``x`` and ``y``.
 
     Examples:
         .. code-block:: python
@@ -12140,43 +12155,38 @@ def logical_and(x, y, out=None, name=None):
             import numpy as np
 
             paddle.disable_static()
-            x_data = np.array([True, True, False, False], dtype=np.bool)
+            x_data = np.array([True], dtype=np.bool)
             y_data = np.array([True, False, True, False], dtype=np.bool)
             x = paddle.to_tensor(x_data)
             y = paddle.to_tensor(y_data)
             res = paddle.logical_and(x, y)
-            print(res.numpy()) # [True False False False]
+            print(res.numpy()) # [True False True False]
     """
-    if x.shape != y.shape:
-        raise TypeError(
-            'Input tensors must be same shape, but received x \'s shape: %s, y \'s shape: %s '
-            % (x.shape, y.shape))
     return _logical_op(
         op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True)
 
 
-@templatedoc()
 def logical_or(x, y, out=None, name=None):
     """
-    :alias_main: paddle.logical_or
-    :alias: paddle.logical_or, paddle.tensor.logical_or, paddle.tensor.logic.logical_or
-    :old_api: paddle.fluid.layers.logical_or
 
-    ``logical_or`` operator computes element-wise logical OR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``.
+    ``logical_or`` operator computes element-wise logical OR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Tensor``.
     Each element of ``out`` is calculated by
 
     .. math::
 
         out = x || y
 
+    .. note::
+        ``paddle.logical_or`` supports broadcasting. If you want to know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
+    
     Args:
-        x(${x_type}): ${x_comment}.
-        y(${y_type}): ${y_comment}.
-        out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output.
-        name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`.
+        x (Tensor): the input tensor, its data type should be bool.
+        y (Tensor): the input tensor, its data type should be bool.
+        out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        ${out_type}: ${out_comment}
+        N-D Tensor. A location into which the result is stored. Its shape is the broadcast shape of ``x`` and ``y``.
 
     Examples:
         .. code-block:: python
@@ -12185,43 +12195,38 @@ def logical_or(x, y, out=None, name=None):
             import numpy as np
 
             paddle.disable_static()
-            x_data = np.array([True, True, False, False], dtype=np.bool)
-            y_data = np.array([True, False, True, False], dtype=np.bool)
-            x = paddle.to_variable(x_data)
-            y = paddle.to_variable(y_data)
+            x_data = np.array([True, False], dtype=np.bool).reshape(2, 1)
+            y_data = np.array([True, False, True, False], dtype=np.bool).reshape(2, 2)
+            x = paddle.to_tensor(x_data)
+            y = paddle.to_tensor(y_data)
             res = paddle.logical_or(x, y)
-            print(res.numpy()) # [True  True  True False]
+            print(res.numpy()) # [[ True  True] [ True False]]
     """
-    if x.shape != y.shape:
-        raise TypeError(
-            'Input tensors must be same shape, but received x \'s shape: %s, y \'s shape: %s '
-            % (x.shape, y.shape))
     return _logical_op(
         op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True)
 
 
-@templatedoc()
 def logical_xor(x, y, out=None, name=None):
     """
-    :alias_main: paddle.logical_xor
-    :alias: paddle.logical_xor, paddle.tensor.logical_xor, paddle.tensor.logic.logical_xor
-    :old_api: paddle.fluid.layers.logical_xor
 
-    ``logical_xor`` operator computes element-wise logical XOR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``.
+    ``logical_xor`` operator computes element-wise logical XOR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Tensor``.
     Each element of ``out`` is calculated by
 
     .. math::
 
         out = (x || y) \&\& !(x \&\& y)
 
+    .. note::
+        ``paddle.logical_xor`` supports broadcasting. If you want to know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
+
     Args:
-        x(${x_type}): ${x_comment}.
-        y(${y_type}): ${y_comment}.
-        out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output.
-        name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`.
+        x (Tensor): the input tensor, its data type should be bool.
+        y (Tensor): the input tensor, its data type should be bool.
+        out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        ${out_type}: ${out_comment}
+        N-D Tensor. A location into which the result is stored. Its shape is the broadcast shape of ``x`` and ``y``.
 
     Examples:
         .. code-block:: python
@@ -12230,17 +12235,13 @@ def logical_xor(x, y, out=None, name=None):
             import numpy as np
 
             paddle.disable_static()
-            x_data = np.array([True, True, False, False], dtype=np.bool)
-            y_data = np.array([True, False, True, False], dtype=np.bool)
-            x = paddle.to_variable(x_data)
-            y = paddle.to_variable(y_data)
+            x_data = np.array([True, False], dtype=np.bool).reshape([2, 1])
+            y_data = np.array([True, False, True, False], dtype=np.bool).reshape([2, 2])
+            x = paddle.to_tensor(x_data)
+            y = paddle.to_tensor(y_data)
             res = paddle.logical_xor(x, y)
-            print(res.numpy()) # [False  True  True False]
+            print(res.numpy()) # [[False,  True], [ True, False]]
     """
-    if x.shape != y.shape:
-        raise TypeError(
-            'Input tensors must be same shape, but received x \'s shape: %s, y \'s shape: %s '
-            % (x.shape, y.shape))
     return _logical_op(
         op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True)
 
@@ -14098,17 +14099,11 @@ def sign(x):
 
 def unique(x, dtype='int32'):
     """
-    :alias_main: paddle.unique
-	:alias: paddle.unique,paddle.tensor.unique,paddle.tensor.manipulation.unique
-	:old_api: paddle.fluid.layers.unique
-
-    **unique**
-
     Return a unique tensor for `x` and an index tensor pointing to this unique tensor.
 
     Args:
-        x(Variable): A 1-D input tensor.
-        dtype(np.dtype|core.VarDesc.VarType|str): The type of index tensor: int32, int64.
+        x(Tensor): A 1-D input tensor, its data type should be float32, float64, int32, int64.
+        dtype(np.dtype|str, optional): The type of index tensor: int32, int64. Default: int32.
 
     Returns:
         tuple: (out, index). `out` is the unique tensor for `x`, with identical dtype to `x`, and \
diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py
index bc1368b562d7b354ce34dc87679fd8a0c5a3d012..fe8ed83923e88be2a0c98a8a539f26500b43b7cb 100644
--- a/python/paddle/fluid/layers/rnn.py
+++ b/python/paddle/fluid/layers/rnn.py
@@ -38,6 +38,7 @@ __all__ = [
     'Decoder',
     'BeamSearchDecoder',
     'rnn',
+    'birnn',
     'dynamic_decode',
     'DecodeHelper',
     'TrainingHelper',
@@ -438,61 +439,146 @@ def rnn(cell,
         is_reverse=False,
         **kwargs):
     """
-	:api_attr: Static Graph
-
     rnn creates a recurrent neural network specified by RNNCell `cell`,
-    which performs :code:`cell.call()` repeatedly until reaches to the maximum
-    length of `inputs`.
-
-    Parameters:
-        cell(RNNCell): An instance of `RNNCell`.
-        inputs(Variable): A (possibly nested structure of) tensor variable[s]. 
-            The shape of tensor should be `[batch_size, sequence_length, ...]`
-            for `time_major == False` or `[sequence_length, batch_size, ...]`
-            for `time_major == True`. It represents the inputs to be unrolled
-            in RNN.
-        initial_states(Variable, optional): A (possibly nested structure of)
-            tensor variable[s], representing the initial state for RNN. 
-            If not provided, `cell.get_initial_states` would be used to produce
-            the initial state. Default None.
-        sequence_length(Variable, optional): A tensor with shape `[batch_size]`.
-            It stores real length of each instance, thus enables users to extract
-            the last valid state when past a batch element's sequence length for
-            correctness. If not provided, the paddings would be treated same as
-            non-padding inputs. Default None.
-        time_major(bool, optional): Indicate the data layout of Tensor included
-            in `input` and `output` tensors. If `False`, the data layout would
-            be batch major with shape `[batch_size, sequence_length, ...]`.  If
-            `True`, the data layout would be time major with shape
-            `[sequence_length, batch_size, ...]`. Default: `False`.
-        is_reverse(bool, optional): Indicate whether to calculate in the reverse
-            order of input sequences. Default: `False`.
-        **kwargs: Additional keyword arguments. Arguments passed to `cell.call`. 
+    which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`) 
+    repeatedly until reaches to the maximum length of `inputs`.
+
+    Arguments:
+        cell(RNNCellBase): An instance of `RNNCellBase`.
+        inputs(Tensor): the input sequences. 
+            If time_major is True, the shape is 
+            `[time_steps, batch_size, input_size]`
+            else the shape is `[batch_size, time_steps, input_size]`.
+        initial_states(Tensor|tuple|list, optional): the initial state of the 
+            rnn cell. Tensor or a possibly nested structure of tensors. If not 
+            provided, `cell.get_initial_states` would be called to produce
+            the initial state. Defaults to None.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as 
+            padded sequences. In each input sequence, elements whose time step 
+            index are not less than the valid length are treated as paddings.
+        time_major (bool): Whether the first dimension of the input means the
+            time steps. Defaults to False.
+        is_reverse (bool, optional): Indicate whether to calculate in the reverse
+            order of input sequences. Defaults to False.
+        **kwargs: Additional keyword arguments to pass to `forward` of the cell. 
 
     Returns:
-        tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \
-            outputs and states, both are Tensor or nested structure of Tensor. \
-            `final_outputs` has the same structure and data types as \
-            the returned `outputs` of :code:`cell.call` , and each Tenser in `final_outputs` \
-            stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \
-            for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \
-            `final_states` is the counterpart at last time step of initial states, \
-            thus has the same structure with it and has tensors with same shapes \
-            and data types.
+        (outputs, final_states)
+        outputs (Tensor|list|tuple): the output sequence. Tensor or nested 
+            structure of Tensors.
+            If `time_major` is True, the shape of each tensor in outputs is
+            `[time_steps, batch_size, hidden_size]`, else 
+            `[batch_size, time_steps, hidden_size]`.
+        final_states (Tensor|list|tuple): final states. A (possibly nested structure of)
+            tensor[s], representing the final state for RNN. It has the same 
+            structure of initial state. Each tensor in final states has the same
+            shape and dtype as the corresponding tensor in initial states.
             
 
     Examples:
 
         .. code-block:: python
-            
-            import paddle.fluid as fluid
 
-            inputs = fluid.data(name="inputs",
-                                shape=[-1, 32, 128],
-                                dtype="float32")
-            cell = fluid.layers.GRUCell(hidden_size=128)
-            outputs = fluid.layers.rnn(cell=cell, inputs=inputs)
+            import paddle
+            paddle.disable_static()
+
+            cell = paddle.nn.SimpleRNNCell(16, 32)
+
+            inputs = paddle.rand((4, 23, 16))
+            prev_h = paddle.randn((4, 32))
+            outputs, final_states = paddle.nn.functional.rnn(cell, inputs, prev_h) 
+
     """
+    if in_dygraph_mode():
+        return _rnn_dynamic_graph(cell, inputs, initial_states, sequence_length,
+                                  time_major, is_reverse, **kwargs)
+    else:
+        return _rnn_static_graph(cell, inputs, initial_states, sequence_length,
+                                 time_major, is_reverse, **kwargs)
+
+
+class ArrayWrapper(object):  # list holder; `append` returns the wrapper
+    def __init__(self, x):
+        self.array = [x]  # seeded with the first item
+
+    def append(self, x):
+        self.array += [x]
+        return self
+
+
+def _maybe_copy(state, new_state, step_mask):
+    """Return `new_state * step_mask + state * (1 - step_mask)` along axis 0."""
+    updated = nn.elementwise_mul(new_state, step_mask, axis=0)
+    kept = nn.elementwise_mul(state, (1 - step_mask), axis=0)
+    return updated + kept
+
+
+def _transpose_batch_time(x):
+    """Swap the first two axes of `x`, keeping every other axis in place."""
+    return nn.transpose(x, [1, 0] + list(range(2, len(x.shape))))
+
+
+def _rnn_dynamic_graph(cell,
+                       inputs,
+                       initial_states=None,
+                       sequence_length=None,
+                       time_major=False,
+                       is_reverse=False,
+                       **kwargs):
+    """Unroll `cell` step by step; returns (final_outputs, final_states)."""
+    time_step_index = 0 if time_major else 1
+    flat_inputs = flatten(inputs)
+    time_steps = flat_inputs[0].shape[time_step_index]
+    # Masking blends old and new states, so a concrete initial state is needed.
+    if initial_states is None:
+        initial_states = cell.get_initial_states(
+            batch_ref=inputs, batch_dim_idx=1 if time_major else 0)
+    if not time_major:
+        inputs = map_structure(_transpose_batch_time, inputs)
+
+    mask = None
+    if sequence_length is not None:
+        # `inputs` may be a nested structure, so read dtype from a flat leaf.
+        mask = sequence_lod.sequence_mask(
+            sequence_length, maxlen=time_steps, dtype=flat_inputs[0].dtype)
+        mask = nn.transpose(mask, [1, 0])
+    if is_reverse:
+        inputs = map_structure(lambda x: tensor.reverse(x, axis=[0]), inputs)
+        if mask is not None:
+            mask = tensor.reverse(mask, axis=[0])
+
+    states = initial_states
+    outputs = []
+    for i in range(time_steps):
+        step_inputs = map_structure(lambda x: x[i], inputs)
+        step_outputs, new_states = cell(step_inputs, states, **kwargs)
+        if mask is not None:
+            new_states = map_structure(
+                partial(_maybe_copy, step_mask=mask[i]), states, new_states)
+        states = new_states
+        if i == 0:
+            outputs = map_structure(ArrayWrapper, step_outputs)
+        else:
+            outputs = map_structure(lambda x, x_array: x_array.append(x),
+                                    step_outputs, outputs)
+
+    final_outputs = map_structure(
+        lambda x: nn.stack(x.array, axis=time_step_index), outputs)
+    if is_reverse:
+        final_outputs = map_structure(
+            lambda x: tensor.reverse(x, axis=time_step_index), final_outputs)
+    return final_outputs, states  # `states` is bound even with zero steps
+
+
+def _rnn_static_graph(cell,
+                      inputs,
+                      initial_states=None,
+                      sequence_length=None,
+                      time_major=False,
+                      is_reverse=False,
+                      **kwargs):
     check_type(inputs, 'inputs', (Variable, list, tuple), 'rnn')
     if isinstance(inputs, (list, tuple)):
         for i, input_x in enumerate(inputs):
@@ -500,30 +586,10 @@ def rnn(cell,
                                      ['float32', 'float64'], 'rnn')
     check_type(initial_states, 'initial_states',
                (Variable, list, tuple, type(None)), 'rnn')
-    if isinstance(initial_states, (list, tuple)):
-        states = map_structure(lambda x: x, initial_states)[0]
-        for i, state in enumerate(states):
-            if isinstance(state, (list, tuple)):
-                for j, state_j in enumerate(state):
-                    check_variable_and_dtype(state_j, 'state_j[' + str(j) + ']',
-                                             ['float32', 'float64'], 'rnn')
-            else:
-                check_variable_and_dtype(state, 'states[' + str(i) + ']',
-                                         ['float32', 'float64'], 'rnn')
 
     check_type(sequence_length, 'sequence_length', (Variable, type(None)),
                'rnn')
 
-    def _maybe_copy(state, new_state, step_mask):
-        # TODO: use where_op
-        new_state = nn.elementwise_mul(
-            new_state, step_mask, axis=0) - nn.elementwise_mul(
-                state, (step_mask - 1), axis=0)
-        return new_state
-
-    def _transpose_batch_time(x):
-        return nn.transpose(x, [1, 0] + list(range(2, len(x.shape))))
-
     def _switch_grad(x, stop=False):
         x.stop_gradient = stop
         return x
@@ -582,6 +648,98 @@ def rnn(cell,
     return (final_outputs, final_states)
 
 
+def birnn(cell_fw,
+          cell_bw,
+          inputs,
+          initial_states=None,
+          sequence_length=None,
+          time_major=False,
+          **kwargs):
+    """
+    birnn creates a bidirectional recurrent neural network specified by
+    RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()`
+    (for dygraph mode :code:`cell.forward`) repeatedly until reaches to
+    the maximum length of `inputs` and then concat the outputs for both RNNs
+    along the last axis.
+
+    Arguments:
+        cell_fw(RNNCellBase): An instance of `RNNCellBase`.
+        cell_bw(RNNCellBase): An instance of `RNNCellBase`.
+        inputs(Tensor): the input sequences.
+            If time_major is True, the shape is
+            `[time_steps, batch_size, input_size]`
+            else the shape is `[batch_size, time_steps, input_size]`.
+        initial_states(tuple, optional): A tuple of initial states of
+            `cell_fw` and `cell_bw`.
+            If not provided, `cell.get_initial_states` would be called to
+            produce initial state for each cell. Defaults to None.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as
+            padded sequences. In each input sequence, elements whose time step
+            index are not less than the valid length are treated as paddings.
+        time_major (bool): Whether the first dimension of the input means the
+            time steps. Defaults to False.
+        **kwargs: Additional keyword arguments to pass to `forward` of each cell.
+
+    Returns:
+        (outputs, final_states)
+        outputs (Tensor): the outputs of the bidirectional RNN. It is the
+            concatenation of the outputs from the forward RNN and backward
+            RNN along the last axis.
+            If time major is True, the shape is `[time_steps, batch_size, size]`,
+            else the shape is `[batch_size, time_steps, size]`, where size is
+            `cell_fw.hidden_size + cell_bw.hidden_size`.
+        final_states (tuple): A tuple of the final states of the forward
+            cell and backward cell.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            cell_fw = paddle.nn.LSTMCell(16, 32)
+            cell_bw = paddle.nn.LSTMCell(16, 32)
+
+            inputs = paddle.rand((4, 23, 16))
+            hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32))
+            hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32))
+            initial_states = ((hf, cf), (hb, cb))
+            outputs, final_states = paddle.nn.functional.birnn(
+                cell_fw, cell_bw, inputs, initial_states)
+
+    """
+    if initial_states is None:
+        # Each direction takes its initial state from its own cell: `cell_fw`
+        # and `cell_bw` are independent instances and need not share a state
+        # shape, so the backward state must not be built from `cell_fw`.
+        batch_dim_idx = 1 if time_major else 0
+        states_fw = cell_fw.get_initial_states(
+            batch_ref=inputs, batch_dim_idx=batch_dim_idx)
+        states_bw = cell_bw.get_initial_states(
+            batch_ref=inputs, batch_dim_idx=batch_dim_idx)
+    else:
+        states_fw, states_bw = initial_states
+
+    outputs_fw, states_fw = rnn(
+        cell_fw, inputs, states_fw, sequence_length,
+        time_major=time_major, **kwargs)
+
+    # The backward cell consumes the same inputs with `is_reverse=True`.
+    outputs_bw, states_bw = rnn(
+        cell_bw, inputs, states_bw, sequence_length,
+        time_major=time_major, is_reverse=True, **kwargs)
+
+    # Concatenate the outputs of both directions along the last axis.
+    outputs = map_structure(lambda x, y: tensor.concat([x, y], -1), outputs_fw,
+                            outputs_bw)
+
+    final_states = (states_fw, states_bw)
+    return outputs, final_states
+
+
 class Decoder(object):
     """
 	:api_attr: Static Graph
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 5122f961f48cc58b573359de979fdc111a59c9ad..6220bf62c79c30737f923e744d5670818f54ff6e 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -13,6 +13,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer)
 list(APPEND DIST_TEST_OPS test_listen_and_serv_op)
+list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer)
 set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
 #remove distribute unittests.
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
@@ -36,7 +37,6 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_base)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_2)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_3)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer)
-list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer)
@@ -347,6 +347,7 @@ if (APPLE OR WIN32)
   list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dynamic)
   list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_exception)
   list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_iterable_dataset)
+  list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dataset)
 endif()
 
 if(NOT WITH_GPU OR WIN32 OR APPLE)
@@ -453,8 +454,7 @@ if(WITH_DISTRIBUTE)
     	   py_test_modules(test_fleet_base_2 MODULES test_fleet_base_2 ENVS ${dist_ENVS})
     	   py_test_modules(test_fleet_base_3 MODULES test_fleet_base_3 ENVS ${dist_ENVS})
     	   py_test_modules(test_fleet_recompute_meta_optimizer MODULES test_fleet_recompute_meta_optimizer ENVS ${dist_ENVS})
-	   py_test_modules(test_fleet_graph_execution_meta_optimizer MODULES test_fleet_graph_execution_meta_optimizer ENVS ${dist_ENVS})
-	   py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS})
+	       py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS})
            py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS})
            py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
     	   py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS})
@@ -489,6 +489,7 @@ if(WITH_DISTRIBUTE)
         bash_test_modules(test_launch_ps START_BASH test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
         bash_test_modules(test_fleet_launch START_BASH test_fleet_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
 
+        # port range (20000, 23000) is reserved for dist-ops
         set(dist_ut_port 20001)
         foreach(TEST_OP ${DIST_TEST_OPS})
             bash_test_modules(${TEST_OP} START_BASH dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
@@ -541,6 +542,7 @@ endif()
 
 add_subdirectory(sequence)
 add_subdirectory(dygraph_to_static)
+add_subdirectory(rnn)
 
 if (WITH_MKLDNN)
     add_subdirectory(mkldnn)
@@ -580,6 +582,7 @@ if(NOT WIN32 AND NOT APPLE)
     set_tests_properties(test_multiprocess_dataloader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
     set_tests_properties(test_multiprocess_dataloader_iterable_dataset_static PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
     set_tests_properties(test_multiprocess_dataloader_iterable_dataset_dynamic PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+    set_tests_properties(test_multiprocess_dataloader_dataset PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 endif()
 
 # setting timeout value for old unittests
diff --git a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
index fe7513ae84238527d25cc28fa40b01f1f099f1c8..863c001f226f86384e2820cb6877ded48cffa119 100644
--- a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
+++ b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
@@ -17,8 +17,9 @@ from __future__ import print_function
 import os
 import logging
 import tarfile
-
+import tempfile
 import random
+import warnings
 
 import paddle
 import paddle.fluid.incubate.data_generator as data_generator
@@ -57,7 +58,7 @@ def load_dnn_input_record(sent):
 def load_lr_input_record(sent):
     res = []
     for _ in [x.split(':') for x in sent.split()]:
-        res.append(int(_[0]))
+        res.append(int(_[0]) % 10000)
     return res
 
 
@@ -120,9 +121,62 @@ def prepare_data():
     lr_input_dim = res[1]
     logger.info('dnn input dim: %d' % dnn_input_dim)
     logger.info('lr input dim: %d' % lr_input_dim)
+
     return dnn_input_dim, lr_input_dim, train_file_path
 
 
+def gen_fake_line(dnn_data_num=7,
+                  dnn_data_range=1e5,
+                  lr_data_num=5,
+                  lr_data_range=1e5):
+    """
+    Build one fake sample line: dnn ids, TAB, "id:1" lr pairs, TAB, label.
+
+    Ids are random integers in [0, range - 1], separated by single spaces;
+    the label is 0 or 1 and the line ends with a newline.
+    """
+    # The default ranges are floats (1e5); random.randint needs integer
+    # bounds, and float bounds raise TypeError on Python >= 3.12.
+    dnn_upper = int(dnn_data_range) - 1
+    lr_upper = int(lr_data_range) - 1
+
+    # for deep data
+    dnn_part = " ".join(
+        str(random.randint(0, dnn_upper)) for _ in range(dnn_data_num))
+
+    # for wide data
+    lr_part = " ".join(
+        str(random.randint(0, lr_upper)) + ":1" for _ in range(lr_data_num))
+
+    # for label
+    label = str(random.randint(0, 1))
+    return dnn_part + "\t" + lr_part + "\t" + label + "\n"
+
+
+def prepare_fake_data(file_nums=8, file_lines=1000):
+    """
+    Create fake data with same type as avazu_ctr_data
+
+    Writes `file_nums` files of `file_lines` generated lines each into a
+    fresh temporary directory and returns the list of their paths.
+    """
+    file_dir = tempfile.mkdtemp()
+    warnings.warn("Fake data write in {}".format(file_dir))
+
+    for file_index in range(file_nums):
+        file_name = "ctr_train_data_part_{}".format(file_index)
+        with open(os.path.join(file_dir, file_name), 'w+') as fout:
+            fout.write("".join(gen_fake_line() for _ in range(file_lines)))
+            warnings.warn("Write done ctr_train_data_part_{}".format(
+                file_index))
+
+    # The directory is new, so it should hold exactly the files written above.
+    file_list = [os.path.join(file_dir, x) for x in os.listdir(file_dir)]
+    assert len(file_list) == file_nums
+
+    return file_list
+
+
 if __name__ == "__main__":
     pairwise_reader = DatasetCtrReader()
     pairwise_reader.run_from_stdin()
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
new file mode 100644
index 0000000000000000000000000000000000000000..0de898d6dde217ec6d5cdf53611f986f7b04863f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
@@ -0,0 +1,220 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Distribute CTR model for test fleet api
+"""
+
+from __future__ import print_function
+
+import shutil
+import tempfile
+import time
+
+import paddle
+import paddle.fluid as fluid
+import os
+import numpy as np
+
+import ctr_dataset_reader
+from test_dist_fleet_heter_base import runtime_main, FleetDistHeterRunnerBase
+from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader
+from paddle.distributed.fleet.base.util_factory import fleet_util
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+
+class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
+    """
+    CTR model for testing, built and trained through the Fleet api
+    """
+
+    def net(self, args, batch_size=4, lr=0.01):
+        """
+        Build the CTR network (dnn and lr branches) and store it on `self`.
+        Args:
+            args: options; `args.reader == "pyreader"` creates `self.reader`
+            batch_size(int): the size of mini-batch for training (unused here)
+            lr(float): learning rate of training (unused here)
+        Returns:
+            avg_cost: mean of the cross-entropy cost.
+        """
+        dnn_input_dim, lr_input_dim = int(1e5), int(1e5)  # embedding table sizes
+
+        dnn_data = fluid.layers.data(
+            name="dnn_data",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=1,
+            append_batch_size=False)
+        lr_data = fluid.layers.data(
+            name="lr_data",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=1,
+            append_batch_size=False)
+        label = fluid.layers.data(
+            name="click",
+            shape=[-1, 1],
+            dtype="float32",
+            lod_level=0,
+            append_batch_size=False)
+
+        datas = [dnn_data, lr_data, label]
+
+        if args.reader == "pyreader":  # self.reader is only set in this mode
+            self.reader = fluid.io.PyReader(
+                feed_list=datas,
+                capacity=64,
+                iterable=False,
+                use_double_buffer=False)
+
+        # build dnn model
+        dnn_layer_dims = [128, 64, 32, 1]
+        dnn_embedding = fluid.layers.embedding(
+            is_distributed=False,
+            input=dnn_data,
+            size=[dnn_input_dim, dnn_layer_dims[0]],
+            param_attr=fluid.ParamAttr(
+                name="deep_embedding",
+                initializer=fluid.initializer.Constant(value=0.01)),
+            is_sparse=True)
+        dnn_pool = fluid.layers.sequence_pool(
+            input=dnn_embedding, pool_type="sum")
+        dnn_out = dnn_pool
+
+        # build lr model
+        lr_embbding = fluid.layers.embedding(
+            is_distributed=False,
+            input=lr_data,
+            size=[lr_input_dim, 1],
+            param_attr=fluid.ParamAttr(
+                name="wide_embedding",
+                initializer=fluid.initializer.Constant(value=0.01)),
+            is_sparse=True)
+        lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum")
+
+        with fluid.device_guard("gpu"):  # NOTE(review): presumably the heter-device part - confirm
+            for i, dim in enumerate(dnn_layer_dims[1:]):
+                fc = fluid.layers.fc(
+                    input=dnn_out,
+                    size=dim,
+                    act="relu",
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Constant(value=0.01)),
+                    name='dnn-fc-%d' % i)
+                dnn_out = fc
+
+            merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
+            label = fluid.layers.cast(label, dtype="int64")  # cross_entropy is fed this int64 label
+            predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
+
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            fluid.layers.Print(avg_cost, message="avg_cost")
+
+        self.feeds = datas
+        self.train_file_path = ["fake1", "fake2"]  # placeholder names; not read in this class
+        self.avg_cost = avg_cost
+        self.predict = predict
+
+        return avg_cost
+
+    def check_model_right(self, dirname):  # re-parse __model__ and dump it as text
+        model_filename = os.path.join(dirname, "__model__")
+
+        with open(model_filename, "rb") as f:
+            program_desc_str = f.read()
+
+        program = fluid.Program.parse_from_string(program_desc_str)
+        with open(os.path.join(dirname, "__model__.proto"), "w") as wn:
+            wn.write(str(program))
+
+    def do_pyreader_training(self, fleet):
+        """
+        do training by feeding `self.reader`, running until it raises EOF
+        Args:
+            fleet(Fleet api): the fleet object of Parameter Server, define distribute training role
+        """
+
+        exe = fluid.Executor(fluid.CPUPlace())
+        fleet.init_worker()
+        exe.run(fluid.default_startup_program())
+        batch_size = 4
+        train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
+        self.reader.decorate_sample_list_generator(train_reader)
+
+        for epoch_id in range(1):
+            self.reader.start()
+            try:
+                pass_start = time.time()
+                while True:
+                    exe.run(program=fluid.default_main_program())
+
+                pass_time = time.time() - pass_start  # NOTE(review): unreachable, the loop exits only by exception
+            except fluid.core.EOFException:
+                self.reader.reset()
+
+        fleet.stop_worker()
+
+    def do_dataset_training(self, fleet):  # train from generated fake data files
+        train_file_list = ctr_dataset_reader.prepare_fake_data()
+
+        exe = fluid.Executor(fluid.CPUPlace())
+
+        fleet.init_worker()
+        exe.run(fluid.default_startup_program())
+
+        thread_num = 1
+        batch_size = 128
+        filelist = fleet_util.get_file_shard(train_file_list)
+        print("filelist: {}".format(filelist))
+
+        # config dataset
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset()
+        dataset.set_batch_size(batch_size)
+        dataset.set_use_var(self.feeds)
+        pipe_command = 'python ctr_dataset_reader.py'
+        dataset.set_pipe_command(pipe_command)
+
+        dataset.set_filelist(filelist)
+        dataset.set_thread(thread_num)
+
+        for epoch_id in range(1):
+            pass_start = time.time()
+            dataset.set_filelist(filelist)  # same list as already set above
+            exe.train_from_dataset(
+                program=fluid.default_main_program(),
+                dataset=dataset,
+                fetch_list=[self.avg_cost],
+                fetch_info=["cost"],
+                print_period=2,
+                debug=int(os.getenv("Debug", "0")))
+            pass_time = time.time() - pass_start
+            print("do_dataset_training done. using time {}".format(pass_time))
+        if os.getenv("SAVE_MODEL") == "1":  # saving is opt-in via this env var
+            model_dir = tempfile.mkdtemp()
+            fleet.save_inference_model(exe, model_dir,
+                                       [feed.name for feed in self.feeds],
+                                       self.avg_cost)
+            self.check_model_right(model_dir)
+            shutil.rmtree(model_dir)  # the dump is removed together with the temp dir
+
+        fleet.stop_worker()
+        print("do_dataset_training stop worker.")
+
+
+if __name__ == "__main__":
+    runtime_main(TestHeterPsCTR2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_test.sh b/python/paddle/fluid/tests/unittests/dist_test.sh
index 42566f63b68e2c24c459c0bd738455d1ec3bc3da..d5a6490042b20a4f9160c55fbb93e9f2f8092eae 100644
--- a/python/paddle/fluid/tests/unittests/dist_test.sh
+++ b/python/paddle/fluid/tests/unittests/dist_test.sh
@@ -61,7 +61,14 @@ for i in {1..2}; do
     fi
 done
 
+echo "dist space:"
+df -h
+
 #display /tmp/files
+echo "ls /tmp/paddle.*"
 ls -l /tmp/paddle.*
 
+echo "ls -l ./"
+ls -l ./
+
 exit 1
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
index dd58a49bb55c24a5e126965bff415d9a54cff5ad..af7e73c41464dbd26c476f20d4a1533e37d34ce3 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
@@ -15,6 +15,7 @@
 import math
 import numpy as np
 import unittest
+import paddle
 from paddle.jit import to_static
 import paddle.fluid as fluid
 from paddle.fluid import ParamAttr
@@ -560,8 +561,8 @@ def train_bmn(args, place, to_static):
     loss_data = []
 
     with fluid.dygraph.guard(place):
-        fluid.default_main_program().random_seed = SEED
-        fluid.default_startup_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
         global local_random
         local_random = np.random.RandomState(SEED)
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a689354f56757ba754b76e3d407cb7083b95b3b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
@@ -0,0 +1,250 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from paddle.static import InputSpec
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import to_variable, declarative, ProgramTranslator, Layer, jit
+
+import unittest
+
+program_trans = ProgramTranslator()
+
+
+class SimpleNet(Layer):
+    def __init__(self):
+        super(SimpleNet, self).__init__()
+        self.linear = fluid.dygraph.Linear(10, 3)
+
+    @declarative(input_spec=[InputSpec(shape=[None, 10], dtype='float32')])
+    def forward(self, x, a=1, b=2):
+        y = self.inner_function(x)
+        return y
+
+    # `declarative` is not required here (forward is already decorated); it is added to test robustness.
+    @declarative
+    def inner_function(self, x):
+        y = self.linear(x)
+        return y
+
+    def add_func(self, x, y):
+        z = x + y
+        return z
+
+    @declarative(input_spec=[[InputSpec([None, 10]), InputSpec([None, 10])]])
+    def func_with_list(self, l):
+        x, y, int_val = l
+        z = x + y
+        z = z + int_val
+        return z
+
+    @declarative(input_spec=[{
+        'x': InputSpec([None, 10]),
+        'y': InputSpec([None, 10])
+    }])
+    def func_with_dict(self, d):
+        x = d['x']
+        y = d['y']
+        int_val = d['int_val']
+
+        z = x + y
+        z = z + int_val
+
+        return z
+
+    @declarative(input_spec=[[
+        InputSpec([None]), {
+            'x': InputSpec([None, 10]),
+            'y': InputSpec([None, 10])
+        }
+    ]])
+    def func_with_list_dict(self, dl):
+        bias = dl[0]
+        x = dl[1]['x']
+        y = dl[1]['y']
+
+        z = x + y
+        z = z + bias
+
+        return z
+
+
+class TestInputSpec(unittest.TestCase):
+    def setUp(self):
+        pass
+
+    def test_with_input_spec(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            x = to_variable(np.ones([4, 10]).astype('float32'))
+            y = to_variable(np.ones([4, 10]).astype('float32') * 2)
+            int_val = 4.
+
+            net = SimpleNet()
+
+            # 1. each method holds an independent program cache
+            out = net(x)
+            self.assertTrue(len(net.forward.program_cache) == 1)
+
+            # 2. test save load
+            jit.save(net, './simple_net')
+            infer_net = fluid.dygraph.jit.load('./simple_net')
+            pred = infer_net(x)
+            self.assertTrue(np.allclose(out.numpy(), pred.numpy()))
+
+            # 3. we can decorate any method
+            x_2 = to_variable(np.ones([4, 20]).astype('float32'))
+            # uses `declarative(func)` instead of `@declarative`
+            net.add_func = declarative(net.add_func)
+            out = net.add_func(x_2, np.ones([20]).astype('float32'))
+            self.assertTrue(len(net.add_func.program_cache) == 1)
+
+            # 4. test input with a list
+            out = net.func_with_list([x, y, int_val])
+
+            # 5. test input with a dict
+            out = net.func_with_dict({'x': x, 'y': y, 'int_val': int_val})
+
+            # 6. test input with a list that contains a dict
+            int_np = np.ones([1]).astype('float32')
+            out = net.func_with_list_dict([int_np, {'x': x, 'y': y}])
+
+    def test_with_error(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            x = to_variable(np.ones([4, 10]).astype('float32'))
+            y = to_variable(np.ones([4, 10]).astype('float32') * 2)
+            int_val = 4.
+
+            net = SimpleNet()
+
+            # 1. kwargs and input_spec should not be specified at the same time
+            with self.assertRaises(ValueError):
+                net(x, a=1, other_kwarg=2)
+
+            # 2. requires len(input_spec) <= len(args)
+            with self.assertRaises(ValueError):
+                net.add_func = declarative(
+                    net.add_func,
+                    input_spec=[
+                        InputSpec([-1, 10]), InputSpec([-1, 10]),
+                        InputSpec([10])
+                    ])
+                net.add_func(x, y)
+
+    def test_concrete_program(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            x = to_variable(np.ones([4, 10]).astype('float32'))
+            y = to_variable(np.ones([4, 10]).astype('float32') * 2)
+            int_val = 4.
+
+            net = SimpleNet()
+            # We can get concrete_program by specifying InputSpec information; no fake input is needed.
+            net.add_func = declarative(
+                net.add_func,
+                input_spec=[
+                    InputSpec([-1, 10]), InputSpec(
+                        [-1, 10], name='y')
+                ])
+            cp1 = net.add_func.concrete_program
+            self.assertTrue(cp1.inputs[-1].shape == (-1, 10))
+            self.assertTrue(cp1.inputs[-1].name == 'y')
+
+            # generate another program
+            net.add_func = declarative(
+                net.add_func,
+                input_spec=[InputSpec([10]), InputSpec(
+                    [10], name='label')])
+            cp2 = net.add_func.concrete_program
+            self.assertTrue(cp2.inputs[-1].shape == (10, ))
+            self.assertTrue(cp2.inputs[-1].name == 'label')
+            # Note(Aurelius84): A new instance is returned each time `declarative(foo)` is called,
+            # so the number of cached programs on the latest instance is 1.
+            self.assertTrue(len(net.add_func.program_cache) == 1)
+            self.assertTrue(cp1 != cp2)
+
+
+def foo_func(a, b, c=1, d=2):
+    # Only `a` and `b` contribute to the result; `c` and `d` are never read.
+    return a + b
+
+
+class TestDifferentInputSpecCacheProgram(unittest.TestCase):
+    def test_with_different_input(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            x_data = np.ones([16, 10]).astype('float32')
+            y_data = np.ones([10]).astype('float32') * 2
+            z_data = np.ones([10]).astype('float32') * 2.2
+
+            foo = declarative(foo_func)
+
+            # [16, 10] + [10] (varbase)
+            out_1 = foo(to_variable(x_data), to_variable(y_data))
+            self.assertTrue(np.allclose(x_data + y_data, out_1.numpy()))
+            self.assertTrue(len(foo.program_cache) == 1)
+
+            # [16, 10] + [10] (numpy)
+            out_2 = foo(to_variable(x_data), y_data)
+            self.assertTrue(np.allclose(x_data + y_data, out_2.numpy()))
+            self.assertTrue(len(foo.program_cache) == 1)
+
+            # [16, 10] + [10] (numpy, same shape but different values)
+            out_3 = foo(to_variable(x_data), z_data)
+            self.assertTrue(np.allclose(x_data + z_data, out_3.numpy()))
+            # hits the cached program
+            self.assertTrue(len(foo.program_cache) == 1)
+
+            # [16, 10] + [10] (numpy) with other different arguments (c=3)
+            out_4 = foo(to_variable(x_data), z_data, 3)
+            self.assertTrue(np.allclose(x_data + z_data, out_4.numpy()))
+            # creates a new program
+            self.assertTrue(len(foo.program_cache) == 2)
+
+    def test_get_concrete_program(self):
+
+        foo = declarative(foo_func)
+
+        # 1. specify InputSpec for `a`/`b`
+        concrete_program_1 = foo.get_concrete_program(
+            InputSpec([None, 10]), InputSpec([10]))
+        print(concrete_program_1)
+        self.assertTrue(len(foo.program_cache) == 1)
+
+        # 2. specify `c`/`d` explicitly with their default values
+        concrete_program_2 = foo.get_concrete_program(
+            InputSpec([None, 10]), InputSpec([10]), 1, 2)
+        self.assertTrue(concrete_program_2 == concrete_program_1)
+        self.assertTrue(len(foo.program_cache) == 1)
+
+        # 3. specify `c` = 2
+        concrete_program_3 = foo.get_concrete_program(
+            InputSpec([None, 10]), InputSpec([10]), c=2)
+        self.assertTrue(concrete_program_3 != concrete_program_1)
+        self.assertTrue(len(foo.program_cache) == 2)
+
+        # 4. specify a.shape = [10]
+        concrete_program_4 = foo.get_concrete_program(
+            InputSpec([10]), InputSpec([10]))
+        self.assertTrue(concrete_program_4 != concrete_program_1)
+        self.assertTrue(len(foo.program_cache) == 3)
+
+        # 5. specify InputSpec only for `a` (the required `b` is missing)
+        with self.assertRaises(ValueError):
+            concrete_program_5 = foo.get_concrete_program(InputSpec([10]))
+
+        # 6. specify unknown kwarg `e`=4 (no exception is asserted for this call)
+        concrete_program_5 = foo.get_concrete_program(
+            InputSpec([10]), InputSpec([10]), e=4)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..88697bc1b36838afd743596cfec036271be33856
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle.static import InputSpec
+from paddle.fluid.dygraph.dygraph_to_static.function_spec import FunctionSpec
+
+from test_declarative import foo_func
+
+import unittest
+
+
+class TestFunctionSpec(unittest.TestCase):
+    def test_constructor(self):
+        foo_spec = FunctionSpec(foo_func)
+        args_name = foo_spec.args_name
+        self.assertListEqual(args_name, ['a', 'b', 'c', 'd'])
+        self.assertTrue(foo_spec.dygraph_function == foo_func)
+        self.assertTrue(foo_spec.input_spec is None)
+
+    def test_verify_input_spec(self):
+        a_spec = InputSpec([None, 10], name='a')
+        b_spec = InputSpec([10], name='b')
+
+        # type(input_spec) should be list or tuple
+        with self.assertRaises(TypeError):
+            foo_spec = FunctionSpec(foo_func, input_spec=a_spec)
+
+        # each element of input_spec should be `InputSpec`
+        with self.assertRaises(ValueError):
+            foo_spec = FunctionSpec(foo_func, input_spec=[a_spec, 10])
+
+        foo_spec = FunctionSpec(foo_func, input_spec=[a_spec, b_spec])
+        self.assertTrue(len(foo_spec.flat_input_spec) == 2)
+
+    def test_unified_args_and_kwargs(self):
+        foo_spec = FunctionSpec(foo_func)
+        # case 1: foo(10, 20, c=4)
+        args, kwargs = foo_spec.unified_args_and_kwargs([10, 20], {'c': 4})
+        self.assertTupleEqual(args, (10, 20, 4, 2))
+        self.assertTrue(len(kwargs) == 0)
+
+        # case 2: foo(a=10, b=20, d=4)
+        args, kwargs = foo_spec.unified_args_and_kwargs(
+            [], {'a': 10,
+                 'b': 20,
+                 'd': 4})
+        self.assertTupleEqual(args, (10, 20, 1, 4))
+        self.assertTrue(len(kwargs) == 0)
+
+        # case 3: foo(10, b=20)
+        args, kwargs = foo_spec.unified_args_and_kwargs([10], {'b': 20})
+        self.assertTupleEqual(args, (10, 20, 1, 2))
+        self.assertTrue(len(kwargs) == 0)
+
+        # too many positional args: requires len(self._arg_names) >= len(args)
+        with self.assertRaises(ValueError):
+            foo_spec.unified_args_and_kwargs([10, 20, 30, 40, 50], {'c': 4})
+
+        # `b` has no default and is given neither positionally nor in kwargs
+        with self.assertRaises(ValueError):
+            foo_spec.unified_args_and_kwargs([10], {'c': 4})
+
+    def test_args_to_input_spec(self):
+        a_spec = InputSpec([None, 10], name='a')
+        b_spec = InputSpec([10], name='b')
+
+        a_tensor = paddle.static.data(name='a_var', shape=[4, 10])
+        b_tensor = paddle.static.data(name='b_var', shape=[4, 10])
+        kwargs = {'c': 1, 'd': 2}
+
+        # case 1: input_spec covers `a` and `b`; the trailing values 1 and 2 come back unchanged
+        foo_spec = FunctionSpec(foo_func, input_spec=[a_spec, b_spec])
+        input_with_spec = foo_spec.args_to_input_spec(
+            (a_tensor, b_tensor, 1, 2), {})
+        self.assertTrue(len(input_with_spec) == 4)
+        self.assertTrue(input_with_spec[0] == a_spec)  # a
+        self.assertTrue(input_with_spec[1] == b_spec)  # b
+        self.assertTrue(input_with_spec[2] == 1)  # c
+        self.assertTrue(input_with_spec[3] == 2)  # d
+
+        # case 2: input_spec covers only `a`; the entry for `b` carries b_tensor's shape and name
+        foo_spec = FunctionSpec(foo_func, input_spec=[a_spec])
+        input_with_spec = foo_spec.args_to_input_spec((a_tensor, b_tensor), {})
+        self.assertTrue(len(input_with_spec) == 2)
+        self.assertTrue(input_with_spec[0] == a_spec)  # a
+        self.assertTupleEqual(input_with_spec[1].shape, (4, 10))  # b.shape
+        self.assertEqual(input_with_spec[1].name, 'b_var')  # b.name
+
+        # case 3
+        # kwargs must be empty when `input_spec` is set
+        foo_spec = FunctionSpec(foo_func, input_spec=[a_spec])
+        with self.assertRaises(ValueError):
+            input_with_spec = foo_spec.args_to_input_spec((a_tensor, b_tensor),
+                                                          {'c': 4})
+
+        # case 4
+        # fewer args than specs: requires len(args) >= len(self._input_spec)
+        foo_spec = FunctionSpec(foo_func, input_spec=[a_spec, b_spec])
+        with self.assertRaises(ValueError):
+            input_with_spec = foo_spec.args_to_input_spec((a_tensor, ), {})
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
index 0e2bac9fa5b5c9e47ce8a08b0187531a3b83dcee..4d735b565ddbcd0bea4e879f0ae5881e459c8f1d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
@@ -21,6 +21,7 @@ import unittest
 import os
 os.environ["CUDA_VISIBLE_DEVICES"] = "2"
 
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import to_variable
 from paddle.fluid.dygraph import Embedding, Linear, GRUUnit
@@ -448,8 +449,8 @@ def do_train(args, to_static):
     place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
     ) else fluid.CPUPlace()
     with fluid.dygraph.guard(place):
-        fluid.default_startup_program().random_seed = SEED
-        fluid.default_main_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
 
         reader = get_random_input_data(args.batch_size, args.vocab_size,
                                        args.num_labels)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..214cd95d3bc620b3bcadb88e57c7e54a593eaaf4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
@@ -0,0 +1,120 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import io
+import logging
+import os
+import sys
+import unittest
+
+import gast
+import six
+
+import paddle
+from paddle.fluid.dygraph.dygraph_to_static import logging_utils
+
+# TODO(liym27): the `mock` library needs to be installed separately in PY2,
+#  but the CI environment has not installed it yet.
+#  After discussion with Tian Shuo, mock is used only in PY3 for now; use it in PY2 once CI installs it.
+if six.PY3:
+    from unittest import mock
+# else:
+#     import mock
+
+
+class TestLoggingUtils(unittest.TestCase):
+    def setUp(self):
+        self.verbosity_level = 1
+        self.code_level = 3
+        self.translator_logger = logging_utils._TRANSLATOR_LOGGER
+
+    def test_verbosity(self):
+        paddle.jit.set_verbosity(None)
+        os.environ[logging_utils.VERBOSITY_ENV_NAME] = '3'
+        self.assertEqual(logging_utils.get_verbosity(), 3)
+
+        paddle.jit.set_verbosity(self.verbosity_level)
+        self.assertEqual(self.verbosity_level, logging_utils.get_verbosity())
+
+        # A string level is rejected with TypeError (as is the float below)
+        with self.assertRaises(TypeError):
+            paddle.jit.set_verbosity("3")
+
+        with self.assertRaises(TypeError):
+            paddle.jit.set_verbosity(3.3)
+
+    def test_code_level(self):
+
+        paddle.jit.set_code_level(None)
+        os.environ[logging_utils.CODE_LEVEL_ENV_NAME] = '2'
+        self.assertEqual(logging_utils.get_code_level(), 2)
+
+        paddle.jit.set_code_level(self.code_level)
+        self.assertEqual(logging_utils.get_code_level(), self.code_level)
+
+        paddle.jit.set_code_level(9)
+        self.assertEqual(logging_utils.get_code_level(), 9)
+
+        with self.assertRaises(TypeError):
+            paddle.jit.set_code_level(3.3)
+
+    def test_log(self):
+        stream = io.BytesIO() if six.PY2 else io.StringIO()
+        log = self.translator_logger.logger
+        stdout_handler = logging.StreamHandler(stream)
+        log.addHandler(stdout_handler)
+
+        warn_msg = "test_warn"
+        error_msg = "test_error"
+        log_msg_1 = "test_log_1"
+        log_msg_2 = "test_log_2"
+
+        if six.PY3:
+            with mock.patch.object(sys, 'stdout', stream):
+                logging_utils.warn(warn_msg)
+                logging_utils.error(error_msg)
+                self.translator_logger.verbosity_level = 2
+                logging_utils.log(1, log_msg_1)
+                logging_utils.log(2, log_msg_2)
+
+            result_msg = '\n'.join([warn_msg, error_msg, log_msg_2, ""])
+            self.assertEqual(result_msg, stream.getvalue())
+
+    def test_log_transformed_code(self):
+        source_code = "x = 3"
+        ast_code = gast.parse(source_code)
+
+        stream = io.BytesIO() if six.PY2 else io.StringIO()
+        log = self.translator_logger.logger
+        stdout_handler = logging.StreamHandler(stream)
+        log.addHandler(stdout_handler)
+
+        if six.PY3:
+            with mock.patch.object(sys, 'stdout', stream):
+                paddle.jit.set_code_level(1)
+                logging_utils.log_transformed_code(1, ast_code,
+                                                   "BasicApiTransformer")
+
+                paddle.jit.set_code_level()
+                logging_utils.log_transformed_code(
+                    logging_utils.LOG_AllTransformer, ast_code,
+                    "All Transformers")
+
+            self.assertIn(source_code, stream.getvalue())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
index 5ec3de5871dd6787c06938a8b771f7d14e54e1e0..a377075062b268723aaa3cb17bfa25d6b181798d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
@@ -14,6 +14,7 @@
 
 import time
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
 from paddle.fluid.param_attr import ParamAttr
@@ -447,8 +448,8 @@ def train_mobilenet(args, to_static):
     with fluid.dygraph.guard(args.place):
 
         np.random.seed(SEED)
-        fluid.default_startup_program().random_seed = SEED
-        fluid.default_main_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
 
         if args.model == "MobileNetV1":
             net = MobileNetV1(class_dim=args.class_dim, scale=1.0)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
index 3da60e955deee9b6d4c74ba5ff1a550ae135afdb..f0fbe54f9dbbf93121655e784601467c13b3a70d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
@@ -133,7 +133,7 @@ class TestWithTrainAndEval(unittest.TestCase):
             x = fluid.dygraph.to_variable(x_data)
             linear_net(x)
 
-            _, partial_layer = program_translator.get_program_cache().last()[-1]
+            _, partial_layer = linear_net.forward.program_cache.last()[-1]
             # check default mode is for training
             self.assertEqual(partial_layer.program,
                              partial_layer._train_program)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py
index 790319936ac015db09e45b9eac799b3bdf0b0250..df2b69297bb4d9167fa3f1a1fe0005a77ededf8a 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py
@@ -19,7 +19,7 @@ import time
 import unittest
 
 import numpy as np
-
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
 from paddle.fluid.dygraph.base import to_variable
@@ -218,8 +218,8 @@ def train(place):
     batch_num = 200
 
     with fluid.dygraph.guard(place):
-        fluid.default_startup_program().random_seed = SEED
-        fluid.default_main_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
         ptb_model = PtbModel(
             hidden_size=hidden_size,
             vocab_size=vocab_size,
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py
index 4813930159744fae362aec7563ea5cda82d958c5..1d211197ebd48f1d0fba87ef807c61a3315ed153 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py
@@ -16,6 +16,7 @@ import gym
 import math
 import itertools
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.dygraph.nn as nn
 from paddle.fluid.dygraph import to_variable, Layer
@@ -64,8 +65,8 @@ def train(args, place, to_static):
     env.seed(SEED)
 
     with fluid.dygraph.guard(place):
-        fluid.default_main_program().random_seed = SEED
-        fluid.default_startup_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
         local_random = np.random.RandomState(SEED)
 
         policy = Policy()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
index 46eb2b42e9265ac7f6340ee0be3a7127e5246eef..6556b2f03bd5304e290792d07d1d969ab255bfdc 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
@@ -215,8 +215,8 @@ def train(to_static):
     """
     with fluid.dygraph.guard(place):
         np.random.seed(SEED)
-        fluid.default_startup_program().random_seed = SEED
-        fluid.default_main_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
 
         train_reader = paddle.batch(
             reader_decorator(paddle.dataset.flowers.train(use_xmap=False)),
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
index 0386b7c7a17a0f93040fa18d688347f30f27850d..6cf59c030c00384b225d5d13160f68a3558084b9 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
@@ -133,7 +133,7 @@ class TestPartialProgramRaiseError(unittest.TestCase):
             x = fluid.dygraph.to_variable(x_data)
             out = net(x)
 
-            program_cache = program_translator.get_program_cache()
+            program_cache = SimpleFcLayer.forward.program_cache
             _, (concrete_program, _) = program_cache.last()
 
             params = concrete_program.parameters
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
index 30cba78fec19c169966e85ff43e79c3a00889616..38e4d5ad5480beb195bcc0c3cc21f033df8fbd5d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
@@ -331,8 +331,8 @@ def train(train_reader, to_static):
     np.random.seed(SEED)
 
     with fluid.dygraph.guard(place):
-        fluid.default_startup_program().random_seed = SEED
-        fluid.default_main_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
         se_resnext = SeResNeXt()
         optimizer = optimizer_setting(train_parameters, se_resnext.parameters())
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py
index fd5a58be26be43996bbb1f80557512bf974de52f..2aa3396fb7f8534374746329af43d4f823e4d5cf 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py
@@ -15,6 +15,7 @@ import time
 import unittest
 import numpy as np
 
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.nn import Conv2D, Linear, Embedding
 from paddle.fluid.dygraph import to_variable, ProgramTranslator, declarative
@@ -285,8 +286,8 @@ def train(args, to_static):
 
     with fluid.dygraph.guard(place):
         np.random.seed(SEED)
-        fluid.default_startup_program().random_seed = SEED
-        fluid.default_main_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
 
         train_reader = fake_data_reader(args.class_num, args.vocab_size,
                                         args.batch_size, args.padding_size)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py
index 552a6307f33378e7b35f84e048729d22a063c796..14b9ac2e99584b35ec0821949b56b6ee92076571 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py
@@ -108,8 +108,8 @@ def train(conf_dict, to_static):
         place = fluid.CPUPlace()
 
     with fluid.dygraph.guard(place):
-        fluid.default_startup_program().random_seed = SEED
-        fluid.default_main_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
 
         conf_dict['dict_size'] = len(vocab)
         conf_dict['seq_len'] = args.seq_len
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
index 7aa465949eb704ee5f23b3ad44f3fc57adb04154..4fc8d27d30cb8f67c30bbcd8dcd30938f906462d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
@@ -18,6 +18,7 @@ import time
 import unittest
 
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 
 import transformer_util as util
@@ -31,10 +32,11 @@ STEP_NUM = 10
 
 
 def train_static(args, batch_generator):
+    paddle.manual_seed(SEED)
+    paddle.framework.random._manual_program_seed(SEED)
     train_prog = fluid.Program()
     startup_prog = fluid.Program()
-    train_prog.random_seed = SEED
-    startup_prog.random_seed = SEED
+
     with fluid.program_guard(train_prog, startup_prog):
         with fluid.unique_name.guard():
             # define input and reader
@@ -128,8 +130,8 @@ def train_static(args, batch_generator):
 def train_dygraph(args, batch_generator):
     with fluid.dygraph.guard(place):
         if SEED is not None:
-            fluid.default_main_program().random_seed = SEED
-            fluid.default_startup_program().random_seed = SEED
+            paddle.manual_seed(SEED)
+            paddle.framework.random._manual_program_seed(SEED)
         # define data loader
         train_loader = fluid.io.DataLoader.from_generator(capacity=10)
         train_loader.set_batch_generator(batch_generator, places=place)
@@ -220,7 +222,8 @@ def train_dygraph(args, batch_generator):
 
 def predict_dygraph(args, batch_generator):
     with fluid.dygraph.guard(place):
-        fluid.default_main_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
 
         # define data loader
         test_loader = fluid.io.DataLoader.from_generator(capacity=10)
@@ -291,7 +294,8 @@ def predict_dygraph(args, batch_generator):
 def predict_static(args, batch_generator):
     test_prog = fluid.Program()
     with fluid.program_guard(test_prog):
-        test_prog.random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
 
         # define input and reader
         input_field_names = util.encoder_data_input_fields + util.fast_decoder_data_input_fields
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py
index 13a97fb7478db8acaa46db3b7a6c4341997193eb..bedca412157f0b4d125f75ee5eabd0145411451b 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py
@@ -20,7 +20,7 @@ import random
 import sys
 import time
 import unittest
-
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import declarative, ProgramTranslator, to_variable
 from paddle.fluid.dygraph.nn import Conv2D, BatchNorm, Linear, Pool2D
@@ -272,8 +272,8 @@ def train(args, fake_data_reader, to_static):
     random.seed(0)
     np.random.seed(0)
     with fluid.dygraph.guard(place):
-        fluid.default_startup_program().random_seed = 1000
-        fluid.default_main_program().random_seed = 1000
+        paddle.manual_seed(1000)
+        paddle.framework.random._manual_program_seed(1000)
 
         video_model = TSM_ResNet("TSM", train_config, 'Train')
 
diff --git a/python/paddle/fluid/tests/unittests/launch_function_helper.py b/python/paddle/fluid/tests/unittests/launch_function_helper.py
index ecfe39b80e9051d332bb8fd2a05de2fa53770e46..13041827ffeabd3d6b79e4f34a67bd09624e54f6 100644
--- a/python/paddle/fluid/tests/unittests/launch_function_helper.py
+++ b/python/paddle/fluid/tests/unittests/launch_function_helper.py
@@ -15,6 +15,7 @@ from multiprocessing import Pool, Process
 import os
 import socket
 from contextlib import closing
+import psutil
 
 
 def launch_func(func, env_dict):
@@ -24,6 +25,21 @@ def launch_func(func, env_dict):
     return proc
 
 
+def wait(procs, timeout=None):
+    """Wait for all descendants of `procs`; kill leftovers, exit(1) on failure."""
+    import sys  # local import: `sys` is not among this module's imports
+    descendants = []
+    for p in procs:
+        descendants.extend(psutil.Process(p.pid).children(recursive=True))
+
+    gone, alive = psutil.wait_procs(descendants, timeout=timeout)
+    for p in alive:  # still running once `timeout` has elapsed
+        p.kill()
+    for p in gone:
+        if p.returncode != 0:
+            sys.exit(1)
+
+
 def _find_free_port(port_set):
     def __free_port():
         with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index ec6b81f138321f2119a5a5aaf4b5ba9ae8f7e69b..9c3ed13cbb0002231888433a451c7d5e7188244d 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import multiprocessing
 import os
 import unittest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid import compiler
@@ -64,10 +65,11 @@ class TestParallelExecutorBase(unittest.TestCase):
                 feed_data_reader, FeedDataReader
             ), "feed_data_reader must be type of FeedDataReader"
 
+        paddle.manual_seed(1)
+        paddle.framework.random._manual_program_seed(1)
         main = fluid.Program()
         startup = fluid.Program()
-        startup.random_seed = 1
-        main.random_seed = 1
+
         with fluid.program_guard(main, startup):
             feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder,
                                               main, method, optimizer)
diff --git a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f71e04c09aa38b8cf7b3a167b84d4dc0e6cc3ec7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")  # every test_*.py in this directory
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")  # drop ".py" to get module names
+
+foreach(TEST_OP ${TEST_OPS})
+    py_test_modules(${TEST_OP} MODULES ${TEST_OP})  # hand each module to py_test_modules under its own name
+endforeach(TEST_OP)
diff --git a/python/paddle/fluid/dygraph/backward_strategy.py b/python/paddle/fluid/tests/unittests/rnn/__init__.py
similarity index 76%
rename from python/paddle/fluid/dygraph/backward_strategy.py
rename to python/paddle/fluid/tests/unittests/rnn/__init__.py
index bfcf66af31ce13b3394b5b091882b1976f9f003a..abf198b97e6e818e1fbe59006f98492640bcee54 100644
--- a/python/paddle/fluid/dygraph/backward_strategy.py
+++ b/python/paddle/fluid/tests/unittests/rnn/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,9 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from paddle.fluid import core
-
-__all__ = ["BackwardStrategy"]
-
-BackwardStrategy = core.BackwardStrategy
diff --git a/python/paddle/fluid/tests/unittests/rnn/convert.py b/python/paddle/fluid/tests/unittests/rnn/convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..02f10694a4b47e8a58e2fd0db4453cafedcbbdc1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/convert.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import numpy as np
+
+
+def convert_params_for_cell(np_cell, paddle_cell):
+    """Copy each numpy parameter into the paddle parameter of the same name."""
+    for name, param in paddle_cell.named_parameters():
+        param.set_value(np_cell.parameters[name])
+
+
+def convert_params_for_cell_static(np_cell, paddle_cell, place):
+    """Write numpy parameters into the global scope's tensors on `place`."""
+    np_params = np_cell.parameters
+    for name, param in paddle_cell.named_parameters():
+        var = paddle.static.global_scope().find_var(param.name)
+        var.get_tensor().set(np_params[name], place)
+
+
+def convert_params_for_net(np_net, paddle_net):
+    """Copy cell parameters layer by layer (one cell, or a fw/bw pair)."""
+    for src, dst in zip(np_net, paddle_net):
+        # Layers without a `cell` attribute are read via `cell_fw`/`cell_bw`.
+        names = ["cell"] if hasattr(src, "cell") else ["cell_fw", "cell_bw"]
+        for attr in names:
+            convert_params_for_cell(getattr(src, attr), getattr(dst, attr))
+
+
+def convert_params_for_net_static(np_net, paddle_net, place):
+    """Static-graph variant: copy cell parameters layer by layer onto `place`.
+
+    Layers lacking a `cell` attribute are read via `cell_fw` / `cell_bw`.
+    """
+    for src, dst in zip(np_net, paddle_net):
+        names = ["cell"] if hasattr(src, "cell") else ["cell_fw", "cell_bw"]
+        for attr in names:
+            convert_params_for_cell_static(
+                getattr(src, attr), getattr(dst, attr), place)
diff --git a/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e0b8374b95cf334b4eced550a79d7c717c07aa7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py
@@ -0,0 +1,516 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import math
+
+
+class LayerMixin(object):  # makes an instance callable by delegating to `forward`
+    def __call__(self, *args, **kwargs):  # all arguments are passed through unchanged
+        return self.forward(*args, **kwargs)  # `forward` is not defined here; users of the mixin supply it
+
+
+class LayerListMixin(LayerMixin):  # callable container that keeps layers in a list
+    def __init__(self, layers=None):  # `layers`: optional iterable of initial layers
+        self._layers = list(layers) if layers else []  # falsy input -> empty list
+
+    def append(self, layer):  # add `layer` at the end of the list
+        self._layers.append(layer)
+
+    def __iter__(self):  # iterate over the layers in insertion order
+        return iter(self._layers)
+
+
+class SimpleRNNCell(LayerMixin):
+    """Numpy reference cell: h = act(x @ W_ih.T + b_ih + h_prev @ W_hh.T + b_hh)."""
+
+    def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh"):
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.bias = bias
+        relu = lambda x: np.maximum(x, 0.)
+        # Any value other than 'tanh' selects the ReLU branch.
+        self.nonlinearity = np.tanh if nonlinearity == 'tanh' else relu
+
+        bound = 1.0 / math.sqrt(hidden_size)
+
+        def draw(*shape):  # one Uniform(-bound, bound) float64 array
+            return np.random.uniform(-bound, bound, shape).astype('float64')
+
+        self.weight_ih = draw(hidden_size, input_size)
+        self.weight_hh = draw(hidden_size, hidden_size)
+        self.parameters = {
+            'weight_ih': self.weight_ih,
+            'weight_hh': self.weight_hh,
+        }
+        self.bias_ih = self.bias_hh = None
+        if bias:
+            self.bias_ih = draw(hidden_size)
+            self.bias_hh = draw(hidden_size)
+            self.parameters['bias_ih'] = self.bias_ih
+            self.parameters['bias_hh'] = self.bias_hh
+
+    def init_state(self, inputs):
+        """Return zeros of shape (inputs.shape[0], hidden_size), dtype of inputs."""
+        zeros_shape = (inputs.shape[0], self.hidden_size)
+        return np.zeros(zeros_shape, dtype=inputs.dtype)
+
+    def forward(self, inputs, hx=None):
+        """One step; the new hidden state doubles as output and next state."""
+        prev = self.init_state(inputs) if hx is None else hx
+        in_proj = np.matmul(inputs, self.weight_ih.T)
+        if self.bias_ih is not None:
+            in_proj += self.bias_ih
+        hid_proj = np.matmul(prev, self.weight_hh.T)
+        if self.bias_hh is not None:
+            hid_proj += self.bias_hh
+        h = self.nonlinearity(in_proj + hid_proj)
+        return h, h
+
+
+class GRUCell(LayerMixin):
+    """Numpy reference GRU cell; stacked weights are split as (r, z, c) gates."""
+
+    def __init__(self, input_size, hidden_size, bias=True):
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.bias = bias
+        bound = 1.0 / math.sqrt(hidden_size)
+
+        def draw(*shape):  # one Uniform(-bound, bound) float64 array
+            return np.random.uniform(-bound, bound, shape).astype('float64')
+
+        self.weight_ih = draw(3 * hidden_size, input_size)
+        self.weight_hh = draw(3 * hidden_size, hidden_size)
+        self.parameters = dict(
+            weight_ih=self.weight_ih, weight_hh=self.weight_hh)
+        self.bias_ih = self.bias_hh = None
+        if bias:
+            self.bias_ih = draw(3 * hidden_size)
+            self.bias_hh = draw(3 * hidden_size)
+            self.parameters['bias_ih'] = self.bias_ih
+            self.parameters['bias_hh'] = self.bias_hh
+
+    def init_state(self, inputs):
+        """Return zeros of shape (inputs.shape[0], hidden_size), dtype of inputs."""
+        zeros_shape = (inputs.shape[0], self.hidden_size)
+        return np.zeros(zeros_shape, dtype=inputs.dtype)
+
+    def forward(self, inputs, hx=None):
+        """One step; the new hidden state doubles as output and next state."""
+        h_prev = self.init_state(inputs) if hx is None else hx
+        from_x = np.matmul(inputs, self.weight_ih.T)
+        if self.bias_ih is not None:
+            from_x = from_x + self.bias_ih
+        from_h = np.matmul(h_prev, self.weight_hh.T)
+        if self.bias_hh is not None:
+            from_h = from_h + self.bias_hh
+
+        # Both projections are cut into reset / update / candidate thirds.
+        x_r, x_z, x_c = np.split(from_x, 3, 1)
+        h_r, h_z, h_c = np.split(from_h, 3, 1)
+
+        reset = 1.0 / (1.0 + np.exp(-(x_r + h_r)))
+        update = 1.0 / (1.0 + np.exp(-(x_z + h_z)))
+        cand = np.tanh(x_c + reset * h_c)  # reset gate applied after the matmul
+        h = (h_prev - cand) * update + cand
+        return h, h
+
+
+class LSTMCell(LayerMixin):
+    """Numpy reference LSTM cell; gate chunks 0..3 act as i, f, candidate, o."""
+
+    def __init__(self, input_size, hidden_size, bias=True):
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.bias = bias
+        bound = 1.0 / math.sqrt(hidden_size)
+
+        def draw(*shape):  # one Uniform(-bound, bound) float64 array
+            return np.random.uniform(-bound, bound, shape).astype('float64')
+
+        self.weight_ih = draw(4 * hidden_size, input_size)
+        self.weight_hh = draw(4 * hidden_size, hidden_size)
+        self.parameters = dict(
+            weight_ih=self.weight_ih, weight_hh=self.weight_hh)
+        self.bias_ih = self.bias_hh = None
+        if bias:
+            self.bias_ih = draw(4 * hidden_size)
+            self.bias_hh = draw(4 * hidden_size)
+            self.parameters['bias_ih'] = self.bias_ih
+            self.parameters['bias_hh'] = self.bias_hh
+
+    def init_state(self, inputs):
+        """Return a zero (h, c) pair, each (inputs.shape[0], hidden_size)."""
+        zeros_shape = (inputs.shape[0], self.hidden_size)
+        init_h = np.zeros(zeros_shape, dtype=inputs.dtype)
+        init_c = np.zeros(zeros_shape, dtype=inputs.dtype)
+        return init_h, init_c
+
+    def forward(self, inputs, hx=None):
+        """One step; returns h as output and (h, c) as the next state."""
+        h_prev, c_prev = self.init_state(inputs) if hx is None else hx
+        gates = np.matmul(inputs, self.weight_ih.T)
+        if self.bias_ih is not None:
+            gates = gates + self.bias_ih
+        gates += np.matmul(h_prev, self.weight_hh.T)
+        if self.bias_hh is not None:
+            gates = gates + self.bias_hh
+
+        def sigmoid(v):
+            return 1.0 / (1.0 + np.exp(-v))
+
+        in_gate, forget_gate, cell_in, out_gate = np.split(gates, 4, -1)
+        i = sigmoid(in_gate)
+        f = sigmoid(forget_gate)
+        o = sigmoid(out_gate)
+        c = f * c_prev + i * np.tanh(cell_in)
+        h = o * np.tanh(c)
+        return h, (h, c)
+
+
+def sequence_mask(lengths, max_len=None):
+    """Boolean mask of shape lengths.shape + (max_len,); True where t < length."""
+    longest = np.max(lengths)
+    assert max_len is None or max_len >= longest
+    steps = np.arange(longest if max_len is None else max_len)
+    return steps < np.expand_dims(lengths, -1)
+
+
+def update_state(mask, new, old):
+    """Pick `new` where `mask` is True, else `old`; tuple/list states per item."""
+    if isinstance(old, (tuple, list)):
+        return tuple(np.where(mask, n, o) for n, o in zip(new, old))
+    return np.where(mask, new, old)
+
+
+def rnn(cell,
+        inputs,
+        initial_states,
+        sequence_length=None,
+        time_major=False,
+        is_reverse=False):
+    """Unroll `cell` over the time axis and return (outputs, final_state).
+
+    With `sequence_length`, steps past a sequence's length emit zeros and
+    leave its state untouched. If `initial_states` is None it is created
+    with `cell.init_state` from the first time step.
+    """
+    if not time_major:
+        inputs = np.transpose(inputs, [1, 0, 2])
+    if is_reverse:
+        inputs = np.flip(inputs, 0)
+
+    mask = None
+    if sequence_length is not None:
+        mask = np.transpose(sequence_mask(sequence_length), [1, 0])
+        mask = np.expand_dims(mask, -1)
+        if is_reverse:
+            mask = np.flip(mask, 0)
+
+    state = initial_states
+    if state is None:
+        # update_state() needs real arrays to fall back on at masked steps.
+        state = cell.init_state(inputs[0])
+    outputs = []
+    for t in range(inputs.shape[0]):
+        y, new_state = cell(inputs[t], state)
+        if mask is not None:
+            y = np.where(mask[t], y, 0.)
+            new_state = update_state(mask[t], new_state, state)
+        outputs.append(y)
+        state = new_state
+
+    outputs = np.stack(outputs)
+    if is_reverse:
+        outputs = np.flip(outputs, 0)
+    if not time_major:
+        outputs = np.transpose(outputs, [1, 0, 2])
+    return outputs, state
+
+
+def birnn(cell_fw,
+          cell_bw,
+          inputs,
+          initial_states,
+          sequence_length=None,
+          time_major=False):
+    """Run `cell_fw` forward and `cell_bw` reversed over the same inputs.
+
+    `initial_states` must unpack into (forward_state, backward_state).
+    Returns the two outputs joined on the last axis plus both final states.
+    """
+    init_fw, init_bw = initial_states
+
+    # Forward direction.
+    out_fw, last_fw = rnn(
+        cell_fw, inputs, init_fw, sequence_length, time_major=time_major)
+    # Backward direction: rnn() flips the time axis when is_reverse is set.
+    out_bw, last_bw = rnn(
+        cell_bw, inputs, init_bw, sequence_length, time_major=time_major,
+        is_reverse=True)
+
+    merged = np.concatenate((out_fw, out_bw), -1)
+    return merged, (last_fw, last_bw)
+
+
+def flatten(nested):  # eager, list-returning wrapper around _flatten()
+    return [leaf for leaf in _flatten(nested)]
+
+
+def _flatten(nested):  # lazily yield leaves of nested lists/tuples, in order
+    for element in nested:
+        if not isinstance(element, (list, tuple)):
+            yield element
+            continue
+        for leaf in _flatten(element):
+            yield leaf
+
+
+def unstack(array, axis=0):
+    """Split `array` into a list of slices along `axis`, dropping that axis."""
+    pieces = np.split(array, array.shape[axis], axis)
+    return [np.squeeze(piece, axis) for piece in pieces]
+
+
+def dropout(array, p=0.5):
+    """Zero entries with probability `p` and scale survivors by 1 / (1 - p)."""
+    if p == 0.0:
+        return array  # the very same object, no copy
+    keep = np.random.uniform(size=array.shape) < (1 - p)
+    return array * (keep.astype(array.dtype) / (1 - p))
+
+
+def split_states(states, bidirectional=False, state_components=1):
+    """Turn stacked states into a list with one entry per layer.
+
+    Each entry is one state (an array, or a tuple of `state_components`
+    arrays); with `bidirectional`, consecutive entries are paired up.
+    """
+    if state_components == 1:
+        per_slot = unstack(states)
+    else:
+        assert len(states) == state_components
+        per_slot = list(zip(*[unstack(part) for part in states]))
+
+    if bidirectional:
+        return list(zip(per_slot[::2], per_slot[1::2]))
+    return per_slot
+
+
+def concat_states(states, bidirectional=False, state_components=1):
+    """Restack per-layer states: one array, or a list with one per component."""
+    # NOTE: `bidirectional` is not consulted here.
+    flat = flatten(states)
+    if state_components == 1:
+        return np.stack(flat)
+    return [
+        np.stack(flat[k::state_components]) for k in range(state_components)
+    ]
+
+
+class RNN(LayerMixin):
+    """Single-direction recurrent layer that unrolls `cell` through rnn()."""
+
+    def __init__(self, cell, is_reverse=False, time_major=False):
+        super(RNN, self).__init__()
+        self.cell = cell
+        self.is_reverse = is_reverse
+        self.time_major = time_major
+        # for non-dygraph mode, `rnn` api uses cell.call
+        if not hasattr(cell, "call"):
+            cell.call = cell.forward
+
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        return rnn(self.cell, inputs,
+                   initial_states=initial_states,
+                   sequence_length=sequence_length,
+                   time_major=self.time_major,
+                   is_reverse=self.is_reverse)
+
+
+class BiRNN(LayerMixin):
+    """Bidirectional layer: one cell per direction, combined through birnn()."""
+    def __init__(self, cell_fw, cell_bw, time_major=False):
+        super(BiRNN, self).__init__()
+        self.cell_fw, self.cell_bw = cell_fw, cell_bw
+        self.time_major = time_major
+
+    def forward(self,
+                inputs,
+                initial_states=None,
+                sequence_length=None,
+                **kwargs):
+        """Extra keyword arguments are accepted but not used."""
+        if not isinstance(initial_states, (list, tuple)):
+            # A single (or missing) state is handed to both directions.
+            initial_states = [initial_states, initial_states]
+        else:
+            assert len(initial_states) == 2, \
+                "length of initial_states should be 2 when it is a list/tuple"
+
+        return birnn(self.cell_fw, self.cell_bw, inputs, initial_states,
+                     sequence_length, self.time_major)
+
+
+class RNNMixin(LayerListMixin):
+    """Shared multi-layer forward pass for the stacked networks built on it.
+
+    Reads attributes the concrete class must set: time_major, num_layers,
+    num_directions, hidden_size, state_components and dropout.
+    """
+
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        batch_size = inputs.shape[1 if self.time_major else 0]
+        bidirectional = self.num_directions == 2
+        if initial_states is None:
+            # Default to all-zero states, one slab per layer and direction.
+            shape = (self.num_layers * self.num_directions, batch_size,
+                     self.hidden_size)
+            zeros = [np.zeros(shape, inputs.dtype)
+                     for _ in range(self.state_components)]
+            initial_states = zeros[0] if self.state_components == 1 \
+                else tuple(zeros)
+        states = split_states(initial_states, bidirectional,
+                              self.state_components)
+
+        final_states = []
+        for i, rnn_layer in enumerate(self):
+            if i > 0:  # dropout sits between layers, not on the raw input
+                inputs = dropout(inputs, self.dropout)
+            outputs, last = rnn_layer(inputs, states[i], sequence_length)
+            final_states.append(last)
+            inputs = outputs
+        return outputs, concat_states(final_states, bidirectional,
+                                      self.state_components)
+
+
+class SimpleRNN(RNNMixin):
+    """Stacked numpy SimpleRNN; direction: forward, backward or bidirectional."""
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 num_layers=1,
+                 nonlinearity="tanh",
+                 direction="forward",
+                 dropout=0.,
+                 time_major=False):
+        super(SimpleRNN, self).__init__()
+
+        def make_cell(in_size):
+            # Keyword is required: the cell's 3rd positional parameter is `bias`.
+            return SimpleRNNCell(in_size, hidden_size, nonlinearity=nonlinearity)
+
+        if direction in ["forward", "backward"]:
+            is_reverse = direction == "backward"
+            self.append(RNN(make_cell(input_size), is_reverse, time_major))
+            for i in range(1, num_layers):
+                self.append(RNN(make_cell(hidden_size), is_reverse, time_major))
+        elif direction == "bidirectional":
+            cell_fw = make_cell(input_size)
+            cell_bw = make_cell(input_size)
+            self.append(BiRNN(cell_fw, cell_bw, time_major))
+            for i in range(1, num_layers):
+                cell_fw = make_cell(2 * hidden_size)
+                cell_bw = make_cell(2 * hidden_size)
+                self.append(BiRNN(cell_fw, cell_bw, time_major))
+        else:
+            raise ValueError(
+                "direction should be forward, backward or bidirectional, "
+                "received direction = {}".format(direction))
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.time_major = time_major
+        self.num_layers = num_layers
+        self.state_components = 1
+
+
+class LSTM(RNNMixin):
+    """Stacked numpy LSTM reference; each layer state has two components."""
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 num_layers=1,
+                 direction="forward",
+                 dropout=0.,
+                 time_major=False):
+        super(LSTM, self).__init__()
+        if direction not in ["forward", "backward", "bidirectional"]:
+            raise ValueError(
+                "direction should be forward, backward or bidirectional, "
+                "received direction = {}".format(direction))
+
+        two_way = direction == "bidirectional"
+        # Layers after the first consume the previous layer's output, which
+        # is twice as wide when both directions are concatenated.
+        deeper_in = (2 if two_way else 1) * hidden_size
+        in_sizes = [input_size] + [deeper_in] * (num_layers - 1)
+        for in_size in in_sizes:
+            if two_way:
+                cell_fw = LSTMCell(in_size, hidden_size)
+                cell_bw = LSTMCell(in_size, hidden_size)
+                self.append(BiRNN(cell_fw, cell_bw, time_major))
+            else:
+                cell = LSTMCell(in_size, hidden_size)
+                self.append(RNN(cell, direction == "backward", time_major))
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.num_directions = 2 if two_way else 1
+        self.time_major = time_major
+        self.num_layers = num_layers
+        self.state_components = 2
+
+
+class GRU(RNNMixin):
+    """Stacked numpy GRU reference; each layer state is a single array."""
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 num_layers=1,
+                 direction="forward",
+                 dropout=0.,
+                 time_major=False):
+        super(GRU, self).__init__()
+        if direction not in ["forward", "backward", "bidirectional"]:
+            raise ValueError(
+                "direction should be forward, backward or bidirectional, "
+                "received direction = {}".format(direction))
+
+        two_way = direction == "bidirectional"
+        # Layers after the first consume the previous layer's output, which
+        # is twice as wide when both directions are concatenated.
+        deeper_in = (2 if two_way else 1) * hidden_size
+        in_sizes = [input_size] + [deeper_in] * (num_layers - 1)
+        for in_size in in_sizes:
+            if two_way:
+                cell_fw = GRUCell(in_size, hidden_size)
+                cell_bw = GRUCell(in_size, hidden_size)
+                self.append(BiRNN(cell_fw, cell_bw, time_major))
+            else:
+                cell = GRUCell(in_size, hidden_size)
+                self.append(RNN(cell, direction == "backward", time_major))
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.num_directions = 2 if two_way else 1
+        self.time_major = time_major
+        self.num_layers = num_layers
+        self.state_components = 1
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d2677229a03f7bdac14a93e176747ba0a5f1d6b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+paddle.framework.set_default_dtype("float64")
+
+import numpy as np
+import unittest
+
+from rnn_numpy import SimpleRNNCell, LSTMCell, GRUCell
+from convert import convert_params_for_cell
+
+
+class TestSimpleRNNCell(unittest.TestCase):
+    """Checks paddle.nn.SimpleRNNCell against the numpy reference cell."""
+
+    def __init__(self, bias=True, place="cpu"):
+        super(TestSimpleRNNCell, self).__init__(methodName="runTest")
+        self.bias = bias
+        if place == "cpu":
+            self.place = paddle.CPUPlace()
+        else:
+            self.place = paddle.CUDAPlace(0)
+
+    def setUp(self):
+        paddle.disable_static(self.place)
+        self.rnn1 = SimpleRNNCell(16, 32, bias=self.bias)
+        self.rnn2 = paddle.nn.SimpleRNNCell(
+            16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+        # Copy the numpy weights over so both cells compute the same thing.
+        convert_params_for_cell(self.rnn1, self.rnn2)
+
+    def _assert_same(self, expected, actual):
+        np.testing.assert_allclose(
+            expected, actual.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_initial_state(self):
+        """Hidden states agree when an explicit previous state is given."""
+        x = np.random.randn(4, 16)
+        prev_h = np.random.randn(4, 32)
+        _, h1 = self.rnn1(x, prev_h)
+        _, h2 = self.rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        self._assert_same(h1, h2)
+
+    def test_with_zero_state(self):
+        """Hidden states agree when no previous state is passed."""
+        x = np.random.randn(4, 16)
+        _, h1 = self.rnn1(x)
+        _, h2 = self.rnn2(paddle.to_variable(x))
+        self._assert_same(h1, h2)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+
+
+class TestGRUCell(unittest.TestCase):
+    """Checks paddle.nn.GRUCell against the numpy reference cell."""
+
+    def __init__(self, bias=True, place="cpu"):
+        super(TestGRUCell, self).__init__(methodName="runTest")
+        self.bias = bias
+        if place == "cpu":
+            self.place = paddle.CPUPlace()
+        else:
+            self.place = paddle.CUDAPlace(0)
+
+    def setUp(self):
+        paddle.disable_static(self.place)
+        self.rnn1 = GRUCell(16, 32, bias=self.bias)
+        self.rnn2 = paddle.nn.GRUCell(
+            16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+        # Copy the numpy weights over so both cells compute the same thing.
+        convert_params_for_cell(self.rnn1, self.rnn2)
+
+    def _assert_same(self, expected, actual):
+        np.testing.assert_allclose(
+            expected, actual.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_initial_state(self):
+        """Hidden states agree when an explicit previous state is given."""
+        x = np.random.randn(4, 16)
+        prev_h = np.random.randn(4, 32)
+        _, h1 = self.rnn1(x, prev_h)
+        _, h2 = self.rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        self._assert_same(h1, h2)
+
+    def test_with_zero_state(self):
+        """Hidden states agree when no previous state is passed."""
+        x = np.random.randn(4, 16)
+        _, h1 = self.rnn1(x)
+        _, h2 = self.rnn2(paddle.to_variable(x))
+        self._assert_same(h1, h2)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+
+
+class TestLSTMCell(unittest.TestCase):
+    """Checks paddle.nn.LSTMCell against the numpy reference cell."""
+
+    def __init__(self, bias=True, place="cpu"):
+        super(TestLSTMCell, self).__init__(methodName="runTest")
+        self.bias = bias
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        # Enter dygraph mode on the requested device explicitly, like the
+        # sibling cell tests, instead of inheriting whatever mode and place
+        # an earlier test case left behind.
+        paddle.disable_static(self.place)
+        rnn1 = LSTMCell(16, 32, bias=self.bias)
+        rnn2 = paddle.nn.LSTMCell(
+            16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+        convert_params_for_cell(rnn1, rnn2)
+        self.rnn1, self.rnn2 = rnn1, rnn2
+
+    def test_with_initial_state(self):
+        """h and c must match when an explicit (h, c) state is supplied."""
+        x = np.random.randn(4, 16)
+        prev_h = np.random.randn(4, 32)
+        prev_c = np.random.randn(4, 32)
+
+        y1, (h1, c1) = self.rnn1(x, (prev_h, prev_c))
+        y2, (h2, c2) = self.rnn2(
+            paddle.to_variable(x),
+            (paddle.to_variable(prev_h), paddle.to_variable(prev_c)))
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        """h and c must match when no initial state is passed."""
+        x = np.random.randn(4, 16)
+
+        y1, (h1, c1) = self.rnn1(x)
+        y2, (h2, c2) = self.rnn2(paddle.to_variable(x))
+        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+
+
+def load_tests(loader, tests, pattern):
+    """unittest `load_tests` hook: one case per (bias, device, cell class)."""
+    devices = ["cpu"]
+    if paddle.fluid.is_compiled_with_cuda():
+        devices.append("gpu")
+    cases = [TestSimpleRNNCell, TestGRUCell, TestLSTMCell]
+    return unittest.TestSuite(
+        cls(bias, device) for bias in [True, False] for device in devices
+        for cls in cases)
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py
new file mode 100644
index 0000000000000000000000000000000000000000..948e47d5b99462c363015936f84058e222d548e2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py
@@ -0,0 +1,326 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+paddle.framework.set_default_dtype("float64")
+
+import numpy as np
+import unittest
+
+from convert import convert_params_for_cell_static
+from rnn_numpy import SimpleRNNCell, LSTMCell, GRUCell
+
+
+class TestSimpleRNNCell(unittest.TestCase):  # static-graph cell vs rnn_numpy cell
+    def __init__(self, bias=True, place="cpu"):
+        super(TestSimpleRNNCell, self).__init__(methodName="runTest")
+        self.bias = bias
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):  # build both cells and initialise the paddle one
+        rnn1 = SimpleRNNCell(16, 32, bias=self.bias)
+        # mp / sp are the main and startup programs that will hold rnn2.
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.SimpleRNNCell(
+                    16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+
+        place = self.place
+        exe = paddle.static.Executor(place)
+        scope = paddle.fluid.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)  # run the startup program in the private scope
+            convert_params_for_cell_static(rnn1, rnn2, place)
+        # NOTE(review): convert_* presumably copied rnn1's weights into rnn2.
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_initial_state(self):  # an explicit initial state is fed in
+        mp = self.mp.clone()  # ops below go into a copy, self.mp stays as is
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(4, 16)
+        prev_h = np.random.randn(4, 32)
+
+        y1, h1 = rnn1(x, prev_h)
+        # Declare the feed placeholders and apply the paddle cell to them.
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                init_h = paddle.data(
+                    "init_h", [-1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data, init_h)
+
+        feed_dict = {x_data.name: x, init_h.name: prev_h}
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+        # Only the new state is compared; y1 and y2 are not checked.
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):  # no initial state is given to either cell
+        mp = self.mp.clone()  # ops below go into a copy, self.mp stays as is
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(4, 16)
+
+        y1, h1 = rnn1(x)
+        # Declare the single feed placeholder and apply the paddle cell.
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data)
+
+        feed_dict = {x_data.name: x}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp,
+                             feed=feed_dict,
+                             fetch_list=[y, h],
+                             use_prune=True)
+        # Only the new state is compared; y1 and y2 are not checked.
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):  # entry point named in __init__; runs both checks
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+
+
+class TestGRUCell(unittest.TestCase):  # static-graph cell vs rnn_numpy cell
+    def __init__(self, bias=True, place="cpu"):
+        super(TestGRUCell, self).__init__(methodName="runTest")
+        self.bias = bias
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):  # build both cells and initialise the paddle one
+        rnn1 = GRUCell(16, 32, bias=self.bias)
+        # mp / sp are the main and startup programs that will hold rnn2.
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.GRUCell(
+                    16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+
+        place = self.place
+        exe = paddle.static.Executor(place)
+        scope = paddle.fluid.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)  # run the startup program in the private scope
+            convert_params_for_cell_static(rnn1, rnn2, place)
+        # NOTE(review): convert_* presumably copied rnn1's weights into rnn2.
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.place = place  # same object __init__ already stored
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_initial_state(self):  # an explicit initial state is fed in
+        mp = self.mp.clone()  # ops below go into a copy, self.mp stays as is
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(4, 16)
+        prev_h = np.random.randn(4, 32)
+
+        y1, h1 = rnn1(x, prev_h)
+        # Declare the feed placeholders and apply the paddle cell to them.
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                init_h = paddle.data(
+                    "init_h", [-1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data, init_h)
+
+        feed_dict = {x_data.name: x, init_h.name: prev_h}
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+        # Only the new state is compared; y1 and y2 are not checked.
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):  # no initial state is given to either cell
+        mp = self.mp.clone()  # ops below go into a copy, self.mp stays as is
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(4, 16)
+
+        y1, h1 = rnn1(x)
+        # Declare the single feed placeholder and apply the paddle cell.
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data)
+
+        feed_dict = {x_data.name: x}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp,
+                             feed=feed_dict,
+                             fetch_list=[y, h],
+                             use_prune=True)
+        # Only the new state is compared; y1 and y2 are not checked.
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):  # entry point named in __init__; runs both checks
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+
+
+class TestLSTMCell(unittest.TestCase):  # static-graph cell vs rnn_numpy cell
+    def __init__(self, bias=True, place="cpu"):
+        super(TestLSTMCell, self).__init__(methodName="runTest")
+        self.bias = bias
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):  # build both cells and initialise the paddle one
+        rnn1 = LSTMCell(16, 32, bias=self.bias)
+        # mp / sp are the main and startup programs that will hold rnn2.
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.LSTMCell(
+                    16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+
+        place = self.place
+        exe = paddle.static.Executor(place)
+        scope = paddle.fluid.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)  # run the startup program in the private scope
+            convert_params_for_cell_static(rnn1, rnn2, place)
+        # NOTE(review): convert_* presumably copied rnn1's weights into rnn2.
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.place = place  # same object __init__ already stored
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_initial_state(self):  # explicit (h, c) state is fed in
+        mp = self.mp.clone()  # ops below go into a copy, self.mp stays as is
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(4, 16)
+        prev_h = np.random.randn(4, 32)
+        prev_c = np.random.randn(4, 32)
+
+        y1, (h1, c1) = rnn1(x, (prev_h, prev_c))
+        # Declare the feed placeholders and apply the paddle cell to them.
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                init_h = paddle.data(
+                    "init_h", [-1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                init_c = paddle.data(
+                    "init_c", [-1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                y, (h, c) = rnn2(x_data, (init_h, init_c))
+
+        feed_dict = {x_data.name: x, init_h.name: prev_h, init_c.name: prev_c}
+        with paddle.static.scope_guard(scope):
+            y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c])
+        # Only the new states are compared; y1 and y2 are not checked.
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):  # no initial state is given to either cell
+        mp = self.mp.clone()  # ops below go into a copy, self.mp stays as is
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(4, 16)
+
+        y1, (h1, c1) = rnn1(x)
+        # Declare the single feed placeholder and apply the paddle cell.
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                y, (h, c) = rnn2(x_data)
+
+        feed_dict = {x_data.name: x}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2, c2 = exe.run(mp,
+                                 feed=feed_dict,
+                                 fetch_list=[y, h, c],
+                                 use_prune=True)
+        # Only the new states are compared; y1 and y2 are not checked.
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):  # entry point named in __init__; runs both checks
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+
+
+def load_tests(loader, tests, pattern):
+    """unittest hook: build one case per (bias, device, cell class) combo."""
+    devices = ["cpu"]
+    if paddle.fluid.is_compiled_with_cuda():
+        devices.append("gpu")
+    cell_tests = (TestSimpleRNNCell, TestGRUCell, TestLSTMCell)
+    cases = [cls(bias, dev) for bias in (True, False) for dev in devices
+             for cls in cell_tests]
+    return unittest.TestSuite(cases)
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef297b3bb62497073fd667238cae8a83daaa4967
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
@@ -0,0 +1,269 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+paddle.set_default_dtype("float64")
+from paddle.fluid.layers import sequence_mask
+
+import numpy as np
+import unittest
+
+from convert import convert_params_for_net
+from rnn_numpy import SimpleRNN, LSTM, GRU
+
+
+class TestSimpleRNN(unittest.TestCase):
+    """Dygraph ``paddle.nn.SimpleRNN`` checked against ``rnn_numpy.SimpleRNN``.
+
+    Each instance covers one (time_major, direction, place) combination.
+    """
+
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super(TestSimpleRNN, self).__init__("runTest")
+        self.time_major, self.direction = time_major, direction
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        # Any place string other than "cpu" selects the first CUDA device.
+        use_cpu = place == "cpu"
+        self.place = paddle.CPUPlace() if use_cpu else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        """Switch to dygraph on ``self.place`` and build both networks."""
+        paddle.disable_static(self.place)
+        net_kwargs = dict(time_major=self.time_major, direction=self.direction)
+        self.rnn1 = SimpleRNN(16, 32, 2, **net_kwargs)
+        self.rnn2 = paddle.nn.SimpleRNN(16, 32, 2, **net_kwargs)
+        # NOTE(review): presumably copies rnn1's weights into rnn2 -- confirm.
+        convert_params_for_net(self.rnn1, self.rnn2)
+
+    def _random_input(self):
+        """Random (12, 4, 16) array; axes 0/1 swapped unless time_major."""
+        x = np.random.randn(12, 4, 16)
+        return x if self.time_major else np.transpose(x, [1, 0, 2])
+
+    def test_with_initial_state(self):
+        """Outputs and final state must match for an explicit initial state."""
+        x = self._random_input()
+        prev_h = np.random.randn(2 * self.num_directions, 4, 32)
+
+        ref_y, ref_h = self.rnn1(x, prev_h)
+        x_var, h_var = paddle.to_variable(x), paddle.to_variable(prev_h)
+        out_y, out_h = self.rnn2(x_var, h_var)
+        np.testing.assert_allclose(ref_y, out_y.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(ref_h, out_h.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        """Outputs and final state must match when no state is passed in."""
+        x = self._random_input()
+
+        ref_y, ref_h = self.rnn1(x)
+        out_y, out_h = self.rnn2(paddle.to_variable(x))
+        np.testing.assert_allclose(ref_y, out_y.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(ref_h, out_h.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_input_lengths(self):
+        """Outputs and final state must match for per-sample lengths."""
+        x = self._random_input()
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        ref_y, ref_h = self.rnn1(x, sequence_length=sequence_length)
+
+        seq_len = paddle.to_variable(sequence_length)
+        mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
+        if self.time_major:
+            mask = paddle.transpose(mask, [1, 0])
+        out_y, out_h = self.rnn2(
+            paddle.to_variable(x), sequence_length=seq_len)
+        # The paddle output is multiplied by the length mask before comparing.
+        out_y = paddle.multiply(out_y, mask, axis=0)
+
+        np.testing.assert_allclose(ref_y, out_y.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(ref_h, out_h.numpy(), atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        """Single entry point; ``load_tests`` builds instances that run this."""
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+        self.test_with_input_lengths()
+
+
+class TestGRU(unittest.TestCase):
+    """Dygraph ``paddle.nn.GRU`` checked against ``rnn_numpy.GRU``.
+
+    Each instance covers one (time_major, direction, place) combination
+    and is executed through ``runTest``.
+    """
+
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        # "runTest" is the method unittest invokes for this instance.
+        super(TestGRU, self).__init__("runTest")
+        self.time_major, self.direction = time_major, direction
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        # Any place string other than "cpu" selects the first CUDA device.
+        use_cpu = place == "cpu"
+        self.place = paddle.CPUPlace() if use_cpu else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        """Switch to dygraph on ``self.place`` and build both networks."""
+        paddle.disable_static(self.place)
+        net_kwargs = dict(time_major=self.time_major, direction=self.direction)
+        self.rnn1 = GRU(16, 32, 2, **net_kwargs)
+        self.rnn2 = paddle.nn.GRU(16, 32, 2, **net_kwargs)
+        # NOTE(review): presumably copies rnn1's weights into rnn2 so both
+        # networks compute the same function -- confirm in convert.py.
+        convert_params_for_net(self.rnn1, self.rnn2)
+
+    def _random_input(self):
+        """Random (12, 4, 16) array; axes 0/1 swapped unless time_major."""
+        x = np.random.randn(12, 4, 16)
+        return x if self.time_major else np.transpose(x, [1, 0, 2])
+
+    def test_with_initial_state(self):
+        """Outputs and final state must match for an explicit initial state."""
+        x = self._random_input()
+        prev_h = np.random.randn(2 * self.num_directions, 4, 32)
+
+        # rnn1 takes the raw arrays; rnn2 the same data as paddle variables.
+        ref_y, ref_h = self.rnn1(x, prev_h)
+        x_var, h_var = paddle.to_variable(x), paddle.to_variable(prev_h)
+        out_y, out_h = self.rnn2(x_var, h_var)
+        np.testing.assert_allclose(ref_y, out_y.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(ref_h, out_h.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        """Outputs and final state must match when no state is passed in."""
+        x = self._random_input()
+
+        # Neither network receives an initial-state argument.
+        ref_y, ref_h = self.rnn1(x)
+        out_y, out_h = self.rnn2(paddle.to_variable(x))
+        np.testing.assert_allclose(ref_y, out_y.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(ref_h, out_h.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_input_lengths(self):
+        """Outputs and final state must match for per-sample lengths."""
+        x = self._random_input()
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        ref_y, ref_h = self.rnn1(x, sequence_length=sequence_length)
+
+        seq_len = paddle.to_variable(sequence_length)
+        mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
+        if self.time_major:
+            mask = paddle.transpose(mask, [1, 0])
+        out_y, out_h = self.rnn2(
+            paddle.to_variable(x), sequence_length=seq_len)
+        # The paddle output is multiplied by the length mask before the
+        # comparison with the reference output.
+        out_y = paddle.multiply(out_y, mask, axis=0)
+
+        np.testing.assert_allclose(ref_y, out_y.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(ref_h, out_h.numpy(), atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        """Single entry point; ``load_tests`` builds instances that run this."""
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+        self.test_with_input_lengths()
+
+
+class TestLSTM(unittest.TestCase):
+    """Dygraph ``paddle.nn.LSTM`` checked against ``rnn_numpy.LSTM``.
+
+    Each instance covers one (time_major, direction, place) combination.
+    """
+
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super(TestLSTM, self).__init__("runTest")
+        self.time_major, self.direction = time_major, direction
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        # Any place string other than "cpu" selects the first CUDA device.
+        use_cpu = place == "cpu"
+        self.place = paddle.CPUPlace() if use_cpu else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        """Switch to dygraph on ``self.place`` and build both networks."""
+        paddle.disable_static(self.place)
+        net_kwargs = dict(time_major=self.time_major, direction=self.direction)
+        self.rnn1 = LSTM(16, 32, 2, **net_kwargs)
+        self.rnn2 = paddle.nn.LSTM(16, 32, 2, **net_kwargs)
+        # NOTE(review): presumably copies rnn1's weights into rnn2 -- confirm.
+        convert_params_for_net(self.rnn1, self.rnn2)
+
+    def _random_input(self):
+        """Random (12, 4, 16) array; axes 0/1 swapped unless time_major."""
+        x = np.random.randn(12, 4, 16)
+        return x if self.time_major else np.transpose(x, [1, 0, 2])
+
+    def test_with_initial_state(self):
+        """Outputs and final (h, c) must match for explicit initial states."""
+        x = self._random_input()
+        prev_h = np.random.randn(2 * self.num_directions, 4, 32)
+        prev_c = np.random.randn(2 * self.num_directions, 4, 32)
+
+        ref_y, (ref_h, ref_c) = self.rnn1(x, (prev_h, prev_c))
+        x_var = paddle.to_variable(x)
+        states = (paddle.to_variable(prev_h), paddle.to_variable(prev_c))
+        out_y, (out_h, out_c) = self.rnn2(x_var, states)
+        np.testing.assert_allclose(ref_y, out_y.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(ref_h, out_h.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(ref_c, out_c.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        """Outputs and final (h, c) must match when no state is passed in."""
+        x = self._random_input()
+
+        # Neither network receives an initial-state argument.
+        ref_y, (ref_h, ref_c) = self.rnn1(x)
+        out_y, (out_h, out_c) = self.rnn2(paddle.to_variable(x))
+        np.testing.assert_allclose(ref_y, out_y.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(ref_h, out_h.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(ref_c, out_c.numpy(), atol=1e-8, rtol=1e-5)
+
+    def test_with_input_lengths(self):
+        """Outputs and final (h, c) must match for per-sample lengths."""
+        x = self._random_input()
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        ref_y, (ref_h, ref_c) = self.rnn1(x, sequence_length=sequence_length)
+
+        seq_len = paddle.to_variable(sequence_length)
+        mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
+        if self.time_major:
+            mask = paddle.transpose(mask, [1, 0])
+        out_y, (out_h, out_c) = self.rnn2(
+            paddle.to_variable(x), sequence_length=seq_len)
+        # The paddle output is multiplied by the length mask before comparing.
+        out_y = paddle.multiply(out_y, mask, axis=0)
+
+        np.testing.assert_allclose(ref_y, out_y.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(ref_h, out_h.numpy(), atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(ref_c, out_c.numpy(), atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        """Single entry point; ``load_tests`` builds instances that run this."""
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+        self.test_with_input_lengths()
+
+
+def load_tests(loader, tests, pattern):
+    """unittest hook: one case per (direction, layout, device, net class)."""
+    devices = ["cpu"]
+    if paddle.fluid.is_compiled_with_cuda():
+        devices.append("gpu")
+    cases = [cls(time_major, direction, dev)
+             for direction in ("forward", "backward", "bidirectional")
+             for time_major in (True, False)
+             for dev in devices for cls in (TestSimpleRNN, TestLSTM, TestGRU)]
+    return unittest.TestSuite(cases)
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py
new file mode 100644
index 0000000000000000000000000000000000000000..90ed6b8b4c9075f5a3e3925bb80e24c81a37869c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py
@@ -0,0 +1,470 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+paddle.set_default_dtype("float64")
+from paddle.fluid.layers import sequence_mask
+
+import numpy as np
+import unittest
+
+from convert import convert_params_for_net_static
+from rnn_numpy import SimpleRNN, LSTM, GRU
+
+
+class TestSimpleRNN(unittest.TestCase):  # static-graph net vs rnn_numpy net
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super(TestSimpleRNN, self).__init__("runTest")
+        self.time_major = time_major
+        self.direction = direction
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):  # build both nets and initialise the paddle one
+        rnn1 = SimpleRNN(
+            16, 32, 2, time_major=self.time_major, direction=self.direction)
+        # mp / sp are the main and startup programs that will hold rnn2.
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.SimpleRNN(
+                    16,
+                    32,
+                    2,
+                    time_major=self.time_major,
+                    direction=self.direction)
+
+        place = self.place
+        exe = paddle.static.Executor(place)
+        scope = paddle.fluid.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)  # run the startup program in the private scope
+            convert_params_for_net_static(rnn1, rnn2, place)
+        # NOTE(review): convert_* presumably copied rnn1's weights into rnn2.
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.place = place  # same object __init__ already stored
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_initial_state(self):  # an explicit initial state is fed in
+        mp = self.mp.clone()  # one copy is enough; ops below go into it
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        prev_h = np.random.randn(2 * self.num_directions, 4, 32)
+
+        y1, h1 = rnn1(x, prev_h)
+        # Declare the feed placeholders and apply the paddle net to them.
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                init_h = paddle.data(
+                    "init_h", [2 * self.num_directions, -1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data, init_h)
+
+        feed_dict = {x_data.name: x, init_h.name: prev_h}
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+        # Both the outputs and the final state are compared.
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):  # no initial state is given to either net
+        mp = self.mp.clone()  # ops below go into a copy, self.mp stays as is
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+
+        y1, h1 = rnn1(x)
+        # Declare the single feed placeholder and apply the paddle net.
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data)
+
+        feed_dict = {x_data.name: x}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+        # Both the outputs and the final state are compared.
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def test_with_input_lengths(self):  # per-sample sequence lengths are fed
+        mp = self.mp.clone()  # ops below go into a copy, self.mp stays as is
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        y1, h1 = rnn1(x, sequence_length=sequence_length)
+        # The paddle output is multiplied by the length mask inside the graph.
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                seq_len = paddle.data("seq_len", [-1], dtype="int64")
+                mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
+                if self.time_major:
+                    mask = paddle.transpose(mask, [1, 0])
+                y, h = rnn2(x_data, sequence_length=seq_len)
+                y = paddle.multiply(y, mask, axis=0)
+
+        feed_dict = {x_data.name: x, seq_len.name: sequence_length}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+        # Both the masked outputs and the final state are compared.
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):  # entry point named in __init__; runs all three checks
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+        self.test_with_input_lengths()
+
+
+class TestGRU(unittest.TestCase):
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super(TestGRU, self).__init__("runTest")
+        self.time_major = time_major
+        self.direction = direction
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        rnn1 = GRU(16,
+                   32,
+                   2,
+                   time_major=self.time_major,
+                   direction=self.direction)
+
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.GRU(16,
+                                     32,
+                                     2,
+                                     time_major=self.time_major,
+                                     direction=self.direction)
+
+        place = self.place
+        exe = paddle.static.Executor(place)
+        scope = paddle.fluid.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)
+            convert_params_for_net_static(rnn1, rnn2, place)
+
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.place = place
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_initial_state(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+
+        prev_h = np.random.randn(2 * self.num_directions, 4, 32)
+
+        y1, h1 = rnn1(x, prev_h)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                init_h = paddle.data(
+                    "init_h", [2 * self.num_directions, -1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data, init_h)
+
+        feed_dict = {x_data.name: x, init_h.name: prev_h}
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):
+        mp = self.mp.clone()
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+
+        y1, h1 = rnn1(x)
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                y, h = rnn2(x_data)
+
+        feed_dict = {x_data.name: x}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def test_with_input_lengths(self):  # compare rnn1 and rnn2 when sequence_length is given
+        mp = self.mp.clone()  # ops built below are added to this clone, not to self.mp
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])  # swap the first two axes of the input
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)  # four lengths, none above 12
+
+        y1, h1 = rnn1(x, sequence_length=sequence_length)  # expected values from rnn1
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                seq_len = paddle.data("seq_len", [-1], dtype="int64")
+                mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
+                if self.time_major:
+                    mask = paddle.transpose(mask, [1, 0])
+                y, h = rnn2(x_data, sequence_length=seq_len)
+                y = paddle.multiply(y, mask, axis=0)  # presumably zeroes outputs past each length -- confirm
+
+        feed_dict = {x_data.name: x, seq_len.name: sequence_length}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):
+        self.test_with_initial_state()
+        self.test_with_zero_state()  # NOTE(review): test_with_input_lengths is not called here, unlike TestLSTM.runTest -- confirm this is intentional
+
+
+class TestLSTM(unittest.TestCase):
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super(TestLSTM, self).__init__("runTest")  # runTest is the single test method of each instance
+        self.time_major = time_major
+        self.direction = direction
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.place = paddle.CPUPlace() if place == "cpu" \
+            else paddle.CUDAPlace(0)
+
+    def setUp(self):
+        rnn1 = LSTM(  # presumably the reference implementation compared against paddle.nn.LSTM -- confirm
+            16, 32, 2, time_major=self.time_major, direction=self.direction)
+
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.LSTM(
+                    16,
+                    32,
+                    2,
+                    time_major=self.time_major,
+                    direction=self.direction)
+
+        place = self.place
+        exe = paddle.static.Executor(place)
+        scope = paddle.fluid.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)  # run the startup program inside the private scope
+            convert_params_for_net_static(rnn1, rnn2, place)  # presumably syncs parameters between rnn1 and rnn2 -- confirm
+
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.place = place
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_initial_state(self):  # compare both LSTMs when (h, c) initial states are given
+        mp = self.mp.clone()  # ops built below are added to this clone, not to self.mp
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])  # swap the first two axes of the input
+        prev_h = np.random.randn(2 * self.num_directions, 4, 32)
+        prev_c = np.random.randn(2 * self.num_directions, 4, 32)
+
+        y1, (h1, c1) = rnn1(x, (prev_h, prev_c))  # expected values from rnn1
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                init_h = paddle.data(
+                    "init_h", [2 * self.num_directions, -1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                init_c = paddle.data(
+                    "init_c", [2 * self.num_directions, -1, 32],
+                    dtype=paddle.framework.get_default_dtype())
+                y, (h, c) = rnn2(x_data, (init_h, init_c))
+
+        feed_dict = {x_data.name: x, init_h.name: prev_h, init_c.name: prev_c}
+        with paddle.static.scope_guard(scope):
+            y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
+
+    def test_with_zero_state(self):  # compare both LSTMs when no initial state is passed
+        mp = self.mp.clone()  # ops built below are added to this clone, not to self.mp
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])  # swap the first two axes of the input
+
+        y1, (h1, c1) = rnn1(x)  # expected values from rnn1
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                y, (h, c) = rnn2(x_data)
+
+        feed_dict = {x_data.name: x}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
+
+    def test_with_input_lengths(self):  # compare both LSTMs when sequence_length is given
+        mp = self.mp.clone()  # ops built below are added to this clone, not to self.mp
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])  # swap the first two axes of the input
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)  # four lengths, none above 12
+
+        y1, (h1, c1) = rnn1(x, sequence_length=sequence_length)  # expected values from rnn1
+
+        with paddle.fluid.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.data(
+                    "input", [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype())
+                seq_len = paddle.data("seq_len", [-1], dtype="int64")
+                mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
+                if self.time_major:
+                    mask = paddle.transpose(mask, [1, 0])
+                y, (h, c) = rnn2(x_data, sequence_length=seq_len)
+                y = paddle.multiply(y, mask, axis=0)  # presumably zeroes outputs past each length -- confirm
+
+        feed_dict = {x_data.name: x, seq_len.name: sequence_length}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):  # runs all three scenarios in sequence as one test
+        self.test_with_initial_state()
+        self.test_with_zero_state()
+        self.test_with_input_lengths()
+
+
+def load_tests(loader, tests, pattern):
+    """Hand-build the suite: one case per direction/layout/device/RNN class."""
+    devices = ["cpu"] + (["gpu"] if paddle.fluid.is_compiled_with_cuda() else [])
+    rnn_cases = unittest.TestSuite()
+    for direction in ("forward", "backward", "bidirectional"):
+        for time_major in (True, False):
+            rnn_cases.addTests(
+                test_class(time_major, direction, device) for device in devices
+                for test_class in (TestSimpleRNN, TestLSTM, TestGRU))
+    return rnn_cases
diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..278d7b27c528803211e21ae7b1f1190e3053bcc4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function, division
+
+import numpy as np
+import unittest
+
+import paddle
+
+# used by model.run_trainer in test_dist_base
+from test_dist_base import RUN_STEP
+
+
+# NOTE: kept compatible with the args expected by TestParallelDyGraphRunnerBase
+class SpawnAssistTestArgs(object):  # minimal stand-in for the args object handed to run_trainer_with_spawn
+    update_method = "local"  # class-level default value
+    trainer_id = 0  # class-level default value
+
+
+class TestDistSpawnRunner(unittest.TestCase):
+    """Compares in-process losses with averaged paddle.distributed.spawn losses."""
+
+    def setUp(self):
+        # NOTE(chenweihang): keep consistent with
+        # TestDistBase.check_with_place
+        self.nprocs = 2
+
+    def _run(self, model, args):
+        """Run the model in the current process and return its result."""
+        args.update_method = "local"
+        return model.run_trainer_with_spawn(args)
+
+    def _run_parallel(self, model, args):
+        """Run the model in self.nprocs spawned processes; one result each."""
+        args.update_method = "nccl2"
+        context = paddle.distributed.spawn(
+            func=model.run_trainer_with_spawn,
+            args=(args, ),
+            nprocs=self.nprocs,
+            join=True)
+        return [res_queue.get() for res_queue in context.return_queues]
+
+    def check_dist_result_with_spawn(self, test_class, delta=1e-3):
+        """Assert per-step losses of both run modes agree within delta."""
+        # 0. prepare model and args
+        model = test_class()
+        args = SpawnAssistTestArgs()
+
+        # 1. calc single card loss
+        losses = self._run(model, args)
+
+        # 2. calc multi card loss (nccl mode)
+        dist_losses_list = self._run_parallel(model, args)
+
+        # 3. compare losses
+        for step_id in range(RUN_STEP):
+            loss = losses[step_id]
+            # average this step's loss over the spawned processes
+            per_proc = [
+                np.array(dist_losses[step_id])
+                for dist_losses in dist_losses_list]
+            dist_loss = sum(per_proc) / self.nprocs
+            self.assertAlmostEqual(
+                loss,
+                dist_loss,
+                delta=delta,
+                msg="The results of single-card execution and multi-card execution are inconsistent. "
+                "single-card loss is:\n{}\nmulti-card average loss is:\n{}\n".
+                format(loss, dist_loss))
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
old mode 100644
new mode 100755
diff --git a/python/paddle/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py
index 969a7da3b71b69296f3313342adbf989c60edb50..2c6c018b9dfac13d97c242e1f36adbddf9dbf3f1 100644
--- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adadelta_op.py
@@ -17,6 +17,8 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
 
 
 class TestAdadeltaOp1(OpTest):
@@ -108,5 +110,54 @@ class TestAdadeltaOp2(OpTest):
         self.check_output()
 
 
+class TestAdadeltaV2(unittest.TestCase):
+    """Smoke tests for paddle.optimizer.Adadelta in dygraph and static mode."""
+
+    def test_adadelta_dygraph(self):
+        paddle.disable_static(paddle.CPUPlace())
+        features = paddle.to_tensor(
+            np.arange(26).reshape(2, 13).astype("float32"))
+        layer = paddle.nn.Linear(13, 5)
+        adadelta = paddle.optimizer.Adadelta(
+            learning_rate=0.01,
+            parameters=layer.parameters(),
+            weight_decay=0.01)
+        layer(features).backward()
+        adadelta.step()
+        adadelta.clear_gradients()
+
+    def test_adadelta(self):
+        # static graph: linear regression trained on the uci_housing reader
+        device = fluid.CPUPlace()
+        program = fluid.Program()
+        with fluid.program_guard(program):
+            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+            prediction = fluid.layers.fc(input=x, size=1, act=None)
+            loss = fluid.layers.mean(
+                fluid.layers.square_error_cost(input=prediction, label=y))
+
+            paddle.optimizer.Adadelta(learning_rate=0.1).minimize(loss)
+
+            batches = paddle.batch(
+                paddle.dataset.uci_housing.train(), batch_size=1)
+            feeder = fluid.DataFeeder(place=device, feed_list=[x, y])
+            executor = fluid.Executor(device)
+            executor.run(fluid.default_startup_program())
+            # one executor step per batch produced by the reader
+            for batch in batches():
+                executor.run(
+                    program, feed=feeder.feed(batch), fetch_list=[loss])
+
+    def test_raise_error(self):
+        # None for learning_rate, rho or epsilon must raise ValueError
+        make_optimizer = paddle.optimizer.Adadelta
+        self.assertRaises(ValueError, make_optimizer, None)
+        self.assertRaises(
+            ValueError, make_optimizer, learning_rate=0.1, rho=None)
+        self.assertRaises(
+            ValueError, make_optimizer, learning_rate=0.1, epsilon=None)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 990499858ca52f5b471211aa659e64d3e13fccc3..d4aafcd27a5aceb3c0b5fa9ddf8343d404bddbf5 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -456,8 +456,9 @@ class TestAdamOpV2(unittest.TestCase):
         state_dict = adam.state_dict()
         adam.set_state_dict(state_dict)
 
-        #learning_rate is Decay
-        learning_rate = fluid.dygraph.CosineDecay(0.1, 10000, 120)
+        #learning_rate is _LRScheduler
+        learning_rate = paddle.optimizer.CosineAnnealingLR(
+            learning_rate=0.1, T_max=10)
         adam = paddle.optimizer.Adam(
             learning_rate=learning_rate,
             weight_decay=fluid.regularizer.L2Decay(0.001),
@@ -498,15 +499,10 @@ class TestAdamOpV2(unittest.TestCase):
         adam.set_lr(lr)
         cur_lr = adam.get_lr()
         assert (lr == cur_lr)
-
-        lr_var = paddle.create_global_var(shape=[1], value=lr, dtype='float32')
-        adam.set_lr(lr_var)
-        cur_lr = adam.get_lr()
-        assert (np.float32(lr) == cur_lr)
-
         with self.assertRaises(TypeError):
-            lr = int(1)
-            adam.set_lr(lr)
+            lr_var = paddle.create_global_var(
+                shape=[1], value=lr, dtype='float32')
+            adam.set_lr(lr_var)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_adamax_api.py b/python/paddle/fluid/tests/unittests/test_adamax_api.py
index f6946dc80b5e55b2e7149f357fe0600916a4fe9f..5a33e11d2862c037639b1643a2e44ff81a757053 100644
--- a/python/paddle/fluid/tests/unittests/test_adamax_api.py
+++ b/python/paddle/fluid/tests/unittests/test_adamax_api.py
@@ -26,7 +26,7 @@ class TestAdamaxAPI(unittest.TestCase):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = paddle.to_variable(value)
-        linear = paddle.nn.Linear(13, 5, dtype="float32")
+        linear = paddle.nn.Linear(13, 5)
         adam = paddle.optimizer.Adamax(
             learning_rate=0.01,
             parameters=linear.parameters(),
diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py
index ddb70d6e6400c8e7ae71cabf92ce8060e220a7da..0a7cf54e2e0f15e51ba1b6f7526837f53c7cc2e0 100644
--- a/python/paddle/fluid/tests/unittests/test_adamw_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -23,7 +23,7 @@ class TestAdamWOp(unittest.TestCase):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = paddle.to_variable(value)
-        linear = paddle.nn.Linear(13, 5, dtype="float32")
+        linear = paddle.nn.Linear(13, 5)
         adam = paddle.optimizer.AdamW(
             learning_rate=0.01,
             parameters=linear.parameters(),
@@ -38,7 +38,7 @@ class TestAdamWOp(unittest.TestCase):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = paddle.to_variable(value)
-        linear = paddle.nn.Linear(13, 5, dtype="float32")
+        linear = paddle.nn.Linear(13, 5)
         adam = paddle.optimizer.AdamW(
             learning_rate=0.0,
             parameters=linear.parameters(),
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a135cea52903a0d896df2d446b58d99e5a18993
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+import paddle
+import paddle.nn.functional as F
+import paddle.fluid as fluid
+
+
+def adaptive_start_index(index, input_size, output_size):
+    return int((index * input_size) // output_size)  # exact integer floor; does not rely on `/` being true division
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(-(-((index + 1) * input_size) // output_size))  # exact integer ceil; does not rely on `/` being true division
+
+
+def avg_pool1D_forward_naive(x, ksize, strides, paddings, global_pool=0,
+                             ceil_mode=False, exclusive=False, adaptive=False,
+                             data_type=np.float64):
+    """NumPy reference for 1-D average pooling over the last axis of x.
+
+    x is indexed as (N, C, L). With adaptive=True, ksize[0] is the output
+    length and strides/paddings are not used; with global_pool == 1 ksize is
+    replaced by [L]. Windows are clipped to [0, L]; each sum is divided by the
+    clipped window length if exclusive or adaptive, else by ksize[0].
+    For data_type int8/uint8 the averages are rounded with np.rint.
+    """
+    N, C, L = x.shape
+    if global_pool == 1:
+        ksize = [L]
+    if adaptive:
+        L_out = ksize[0]
+    elif ceil_mode:
+        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1
+                 ) // strides[0] + 1
+    else:
+        L_out = (L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+
+    out = np.zeros((N, C, L_out))
+    for i in range(L_out):
+        if adaptive:
+            r_start = adaptive_start_index(i, L, ksize[0])
+            r_end = adaptive_end_index(i, L, ksize[0])
+        else:
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L))
+        field_size = (r_end - r_start) \
+            if (exclusive or adaptive) else (ksize[0])
+        # x is 3-D, so the window is reduced over axis 2 only (there is no axis 3)
+        pooled = np.sum(x[:, :, r_start:r_end], axis=2) / field_size
+        if data_type == np.int8 or data_type == np.uint8:
+            pooled = np.rint(pooled)
+        out[:, :, i] = pooled.astype(data_type)
+    return out
+
+
+class TestPool1d_API(unittest.TestCase):
+    """Checks adaptive_avg_pool1d against avg_pool1D_forward_naive per place."""
+
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_adaptive_avg_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            data = np.random.random([2, 3, 32]).astype("float32")
+            tensor = fluid.dygraph.to_variable(data)
+            functional_out = F.adaptive_avg_pool1d(tensor, output_size=16)
+            expected = avg_pool1D_forward_naive(
+                data, ksize=[16], strides=[0], paddings=[0], adaptive=True)
+            self.assertTrue(np.allclose(functional_out.numpy(), expected))
+
+            avg_pool_layer = paddle.nn.layer.AdaptiveAvgPool1d(output_size=16)
+            layer_out = avg_pool_layer(tensor)
+            self.assertTrue(np.allclose(layer_out.numpy(), expected))
+
+    def check_adaptive_avg_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
+            pooled = F.adaptive_avg_pool1d(x, output_size=16)
+
+            data = np.random.random([2, 3, 32]).astype("float32")
+            expected = avg_pool1D_forward_naive(
+                data, ksize=[16], strides=[2], paddings=[0], adaptive=True)
+
+            executor = fluid.Executor(place)
+            outputs = executor.run(fluid.default_main_program(),
+                                   feed={"input": data},
+                                   fetch_list=[pooled])
+            self.assertTrue(np.allclose(outputs[0], expected))
+
+    def test_adaptive_avg_pool1d(self):
+        for place in self.places:
+            self.check_adaptive_avg_dygraph_results(place)
+            self.check_adaptive_avg_static_results(place)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..875fdf9e9c3f9a9b891ecc6911dfeda788eee271
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid import compiler, Program, program_guard
+import paddle
+import paddle.nn.functional as F
+import paddle.fluid as fluid
+
+
+def adaptive_start_index(index, input_size, output_size):
+    return int((index * input_size) // output_size)  # exact integer floor; does not rely on `/` being true division
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(-(-((index + 1) * input_size) // output_size))  # exact integer ceil; does not rely on `/` being true division
+
+
+def max_pool1D_forward_naive(x, ksize, strides, paddings, global_pool=0,
+                             ceil_mode=False, exclusive=False, adaptive=False,
+                             data_type=np.float64):
+    """NumPy reference for 1-D max pooling over the last axis of x.
+
+    x is indexed as (N, C, L). With adaptive=True, ksize[0] is the output
+    length and strides/paddings are not used; with global_pool == 1 ksize is
+    replaced by [L]. exclusive and data_type are accepted but not used.
+    """
+    N, C, L = x.shape
+    if global_pool == 1:
+        ksize = [L]
+    if adaptive:
+        L_out = ksize[0]
+    elif ceil_mode:
+        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1
+                 ) // strides[0] + 1
+    else:
+        L_out = (L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+
+    out = np.zeros((N, C, L_out))
+    for i in range(L_out):
+        if adaptive:
+            lo = adaptive_start_index(i, L, ksize[0])
+            hi = adaptive_end_index(i, L, ksize[0])
+        else:
+            lo = np.max((i * strides[0] - paddings[0], 0))
+            hi = np.min((i * strides[0] + ksize[0] - paddings[0], L))
+        # maximum over the clipped window along the length axis
+        out[:, :, i] = np.max(x[:, :, lo:hi], axis=2)
+    return out
+
+
+class TestPool1d_API(unittest.TestCase):  # checks adaptive_max_pool1d against max_pool1D_forward_naive
+    def setUp(self):
+        np.random.seed(123)  # fixed seed for the random inputs drawn below
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_adaptive_max_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.adaptive_max_pool1d(input, output_size=16)
+
+            result_np = max_pool1D_forward_naive(  # with adaptive=True, ksize is the output length; strides is unused
+                input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1d(  # layer form, compared with the same expected values
+                output_size=16)
+            result = ada_max_pool1d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_adaptive_max_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):  # fresh main and startup programs
+            input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
+            result = F.adaptive_max_pool1d(input, output_size=16)
+
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            result_np = max_pool1D_forward_naive(  # with adaptive=True, ksize is the output length; strides is unused
+                input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def test_adaptive_max_pool1d(self):
+        for place in self.places:  # CPU, plus CUDAPlace(0) when compiled with CUDA
+            self.check_adaptive_max_dygraph_results(place)
+            self.check_adaptive_max_static_results(place)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..d78788eb1e7c63be485210780db25e1de6fd84b4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
@@ -0,0 +1,274 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import unittest
+import numpy as np
+
+import paddle.fluid.core as core
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+
+
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))  # floor(index * input_size / output_size) as a Python int
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))  # ceil((index + 1) * input_size / output_size) as a Python int
+
+
+def adaptive_pool2d_forward(x, output_size, data_format='NCHW',
+                            pool_type="max"):
+    """NumPy reference for 2-D adaptive pooling ('max' or 'avg').
+
+    output_size is an int, None, or a (height, width) pair; None (for either
+    entry) keeps the corresponding input extent. The caller's output_size is
+    never modified. data_format is 'NCHW' or 'NHWC'; the result uses the same
+    layout as x.
+    """
+    N = x.shape[0]
+    C, H, W = [x.shape[1], x.shape[2], x.shape[3]] if data_format == 'NCHW' \
+        else [x.shape[3], x.shape[1], x.shape[2]]
+
+    if isinstance(output_size, int) or output_size is None:
+        H_out = W_out = output_size
+    else:
+        H_out, W_out = output_size
+    # Resolve None into locals instead of writing back into the argument.
+    H_out = H if H_out is None else H_out
+    W_out = W if W_out is None else W_out
+
+    out = np.zeros((N, C, H_out, W_out)) if data_format=='NCHW' \
+        else np.zeros((N, H_out, W_out, C))
+
+    for i in range(H_out):
+        in_h_start = adaptive_start_index(i, H, H_out)
+        in_h_end = adaptive_end_index(i, H, H_out)
+
+        for j in range(W_out):
+            in_w_start = adaptive_start_index(j, W, W_out)
+            in_w_end = adaptive_end_index(j, W, W_out)
+
+            if data_format == 'NCHW':
+                x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end]
+                if pool_type == 'avg':
+                    field_size = (
+                        (in_h_end - in_h_start) * (in_w_end - in_w_start))
+                    out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
+                elif pool_type == 'max':
+                    out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
+            elif data_format == 'NHWC':
+                x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :]
+                if pool_type == 'avg':
+                    field_size = (
+                        (in_h_end - in_h_start) * (in_w_end - in_w_start))
+                    out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size
+                elif pool_type == 'max':
+                    out[:, i, j, :] = np.max(x_masked, axis=(1, 2))
+    return out
+
+
+class TestAdaptiveMaxPool2dAPI(unittest.TestCase):  # functional adaptive_max_pool2d vs. adaptive_pool2d_forward
+    def setUp(self):
+        self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
+        self.res_1_np = adaptive_pool2d_forward(  # expected values for output_size=[3, 3]
+            x=self.x_np, output_size=[3, 3], pool_type="max")
+
+        self.res_2_np = adaptive_pool2d_forward(  # expected values for an int output_size
+            x=self.x_np, output_size=5, pool_type="max")
+
+        self.res_3_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[2, 5], pool_type="max")  # the string literal below keeps the NHWC case disabled
+        """
+        self.res_4_np = adaptive_pool2d_forward(
+            x=self.x_np,
+            output_size=[3, 3],
+            pool_type="max",
+            data_format="NHWC")
+        """
+        self.res_5_np = adaptive_pool2d_forward(  # None keeps the input height
+            x=self.x_np, output_size=[None, 3], pool_type="max")
+
+    def test_static_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.enable_static()
+            x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32")
+
+            out_1 = paddle.nn.functional.adaptive_max_pool2d(
+                x=x, output_size=[3, 3])
+
+            out_2 = paddle.nn.functional.adaptive_max_pool2d(x=x, output_size=5)
+
+            out_3 = paddle.nn.functional.adaptive_max_pool2d(
+                x=x, output_size=[2, 5])
+
+            #out_4 = paddle.nn.functional.adaptive_max_pool2d(
+            #    x=x, output_size=[3, 3], data_format="NHWC")
+
+            out_5 = paddle.nn.functional.adaptive_max_pool2d(
+                x=x, output_size=[None, 3])
+
+            exe = paddle.static.Executor(place=place)
+            [res_1, res_2, res_3, res_5] = exe.run(
+                fluid.default_main_program(),  # NOTE(review): ops are built in the default program on every loop pass -- confirm intended
+                feed={"x": self.x_np},
+                fetch_list=[out_1, out_2, out_3, out_5])
+
+            assert np.allclose(res_1, self.res_1_np)
+
+            assert np.allclose(res_2, self.res_2_np)
+
+            assert np.allclose(res_3, self.res_3_np)
+
+            #assert np.allclose(res_4, self.res_4_np)
+
+            assert np.allclose(res_5, self.res_5_np)
+
+    def test_dynamic_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.disable_static(place=place)  # switch to dygraph mode on this place
+            x = paddle.to_variable(self.x_np)
+
+            out_1 = paddle.nn.functional.adaptive_max_pool2d(
+                x=x, return_indices=False, output_size=[3, 3])
+
+            out_2 = paddle.nn.functional.adaptive_max_pool2d(x=x, output_size=5)
+
+            out_3 = paddle.nn.functional.adaptive_max_pool2d(
+                x=x, output_size=[2, 5])
+
+            #out_4 = paddle.nn.functional.adaptive_max_pool2d(
+            #    x=x, output_size=[3, 3], data_format="NHWC")
+
+            out_5 = paddle.nn.functional.adaptive_max_pool2d(
+                x=x, output_size=[None, 3])
+
+            assert np.allclose(out_1.numpy(), self.res_1_np)
+
+            assert np.allclose(out_2.numpy(), self.res_2_np)
+
+            assert np.allclose(out_3.numpy(), self.res_3_np)
+
+            #assert np.allclose(out_4.numpy(), self.res_4_np)
+
+            assert np.allclose(out_5.numpy(), self.res_5_np)
+
+
+class TestAdaptiveMaxPool2dClassAPI(unittest.TestCase):
+    """Checks the paddle.nn.AdaptiveMaxPool2d layer against NumPy results."""
+
+    def setUp(self):
+        # Expected values come from the adaptive_pool2d_forward reference.
+        self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
+        self.res_1_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[3, 3], pool_type="max")
+
+        self.res_2_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=5, pool_type="max")
+
+        self.res_3_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[2, 5], pool_type="max")
+
+        #self.res_4_np = adaptive_pool2d_forward(
+        #    x=self.x_np,
+        #    output_size=[3, 3],
+        #    pool_type="max",
+        #    data_format="NHWC")
+
+        self.res_5_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[None, 3], pool_type="max")
+
+    def test_static_graph(self):
+        """Run the layer in static graph mode on every available place."""
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.enable_static()
+            x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32")
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3])
+            out_1 = adaptive_max_pool(x=x)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=5)
+            out_2 = adaptive_max_pool(x=x)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[2, 5])
+            out_3 = adaptive_max_pool(x=x)
+
+            #    adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(
+            #        output_size=[3, 3], data_format="NHWC")
+            #    out_4 = adaptive_max_pool(x=x)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(
+                output_size=[None, 3])
+            out_5 = adaptive_max_pool(x=x)
+
+            exe = paddle.static.Executor(place=place)
+            [res_1, res_2, res_3, res_5] = exe.run(
+                fluid.default_main_program(),
+                feed={"x": self.x_np},
+                fetch_list=[out_1, out_2, out_3, out_5])
+
+            # assertTrue rather than a bare assert: bare asserts are removed
+            # when Python runs with -O, which would silently skip the checks.
+            self.assertTrue(np.allclose(res_1, self.res_1_np))
+            self.assertTrue(np.allclose(res_2, self.res_2_np))
+            self.assertTrue(np.allclose(res_3, self.res_3_np))
+            #self.assertTrue(np.allclose(res_4, self.res_4_np))
+            self.assertTrue(np.allclose(res_5, self.res_5_np))
+
+    def test_dynamic_graph(self):
+        """Run the layer in dynamic graph mode on every available place."""
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.disable_static(place=place)
+            x = paddle.to_variable(self.x_np)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3])
+            out_1 = adaptive_max_pool(x=x)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=5)
+            out_2 = adaptive_max_pool(x=x)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[2, 5])
+            out_3 = adaptive_max_pool(x=x)
+
+            #adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(
+            #    output_size=[3, 3], data_format="NHWC")
+            #out_4 = adaptive_max_pool(x=x)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(
+                output_size=[None, 3])
+            out_5 = adaptive_max_pool(x=x)
+
+            # assertTrue keeps these checks alive under `python -O` as well.
+            self.assertTrue(np.allclose(out_1.numpy(), self.res_1_np))
+            self.assertTrue(np.allclose(out_2.numpy(), self.res_2_np))
+            self.assertTrue(np.allclose(out_3.numpy(), self.res_3_np))
+            #self.assertTrue(np.allclose(out_4.numpy(), self.res_4_np))
+            self.assertTrue(np.allclose(out_5.numpy(), self.res_5_np))
+
+
+if __name__ == '__main__':
+    unittest.main()  # run the tests above when this file is executed directly
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py
new file mode 100755
index 0000000000000000000000000000000000000000..a7de0a5c6a7017617124b893313e0f9830cc09f9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py
@@ -0,0 +1,293 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import unittest
+import numpy as np
+
+import paddle.fluid.core as core
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+
+
+def adaptive_start_index(index, input_size, output_size):  # window start
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):  # exclusive window end
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
+def adaptive_pool3d_forward(x,
+                            output_size,
+                            adaptive=True,
+                            data_format='NCDHW',
+                            pool_type='max'):
+    """NumPy reference implementation of 3-D adaptive pooling.
+
+    Args:
+        x: 5-D array; NCDHW when data_format is 'NCDHW', else read as NDHWC.
+        output_size: int/None used for D, H and W alike, or a 3-sequence
+            (D_out, H_out, W_out); a None entry keeps that input extent.
+        adaptive: not read by this function; kept for interface stability.
+        pool_type: 'max' or 'avg'; any other value leaves the output zero.
+
+    Returns:
+        float64 array with D, H, W replaced by the resolved output sizes.
+    """
+    channels_first = data_format == 'NCDHW'
+    N = x.shape[0]
+    C, D, H, W = [x.shape[1], x.shape[2], x.shape[3], x.shape[4]] \
+        if channels_first else [x.shape[4], x.shape[1], x.shape[2], x.shape[3]]
+
+    if isinstance(output_size, int) or output_size is None:
+        D_out = H_out = W_out = output_size
+    else:
+        D_out, H_out, W_out = output_size
+    # Resolve None into locals only, so the caller's `output_size` is never
+    # mutated and tuples are accepted as well as lists.
+    D_out = D if D_out is None else D_out
+    H_out = H if H_out is None else H_out
+    W_out = W if W_out is None else W_out
+
+    out = np.zeros((N, C, D_out, H_out, W_out)) if channels_first \
+        else np.zeros((N, D_out, H_out, W_out, C))
+    for k in range(D_out):
+        d_start = adaptive_start_index(k, D, D_out)
+        d_end = adaptive_end_index(k, D, D_out)
+        for i in range(H_out):
+            h_start = adaptive_start_index(i, H, H_out)
+            h_end = adaptive_end_index(i, H, H_out)
+            for j in range(W_out):
+                w_start = adaptive_start_index(j, W, W_out)
+                w_end = adaptive_end_index(j, W, W_out)
+                # Pick the window, reduction axes and output cell per layout.
+                if channels_first:
+                    window = x[:, :, d_start:d_end, h_start:h_end, w_start:
+                               w_end]
+                    axes = (2, 3, 4)
+                    cell = (slice(None), slice(None), k, i, j)
+                elif data_format == 'NDHWC':
+                    window = x[:, d_start:d_end, h_start:h_end, w_start:
+                               w_end, :]
+                    axes = (1, 2, 3)
+                    cell = (slice(None), k, i, j, slice(None))
+                else:
+                    continue  # unrecognised layout: the cell stays zero
+
+                if pool_type == 'avg':
+                    field_size = (d_end - d_start) * (h_end - h_start) * (
+                        w_end - w_start)
+                    out[cell] = np.sum(window, axis=axes) / field_size
+                elif pool_type == 'max':
+                    out[cell] = np.max(window, axis=axes)
+    return out
+
+
+class TestAdaptiveMaxPool3dAPI(unittest.TestCase):
+    """Checks functional adaptive_max_pool3d against NumPy reference results."""
+
+    def setUp(self):
+        self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
+        self.res_1_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[3, 3, 3], pool_type="max")
+
+        self.res_2_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=5, pool_type="max")
+
+        self.res_3_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[2, 3, 5], pool_type="max")
+
+        # Reads x_np as NDHWC; nothing uses it while out_4 stays disabled.
+        self.res_4_np = adaptive_pool3d_forward(
+            x=self.x_np,
+            output_size=[3, 3, 3],
+            pool_type="max",
+            data_format="NDHWC")
+
+        self.res_5_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[None, 3, None], pool_type="max")
+
+    def test_static_graph(self):
+        """Run the functional API in static graph mode on every place."""
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.enable_static()
+            x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+
+            out_1 = paddle.nn.functional.adaptive_max_pool3d(
+                x=x, output_size=[3, 3, 3])
+
+            out_2 = paddle.nn.functional.adaptive_max_pool3d(x=x, output_size=5)
+
+            out_3 = paddle.nn.functional.adaptive_max_pool3d(
+                x=x, output_size=[2, 3, 5])
+
+            #out_4 = paddle.nn.functional.adaptive_max_pool3d(
+            #    x=x, output_size=[3, 3, 3], data_format="NDHWC")
+
+            out_5 = paddle.nn.functional.adaptive_max_pool3d(
+                x=x, output_size=[None, 3, None])
+
+            exe = paddle.static.Executor(place=place)
+            [res_1, res_2, res_3, res_5] = exe.run(
+                fluid.default_main_program(),
+                feed={"x": self.x_np},
+                fetch_list=[out_1, out_2, out_3, out_5])
+
+            # assertTrue rather than a bare assert: bare asserts are removed
+            # when Python runs with -O, which would silently skip the checks.
+            self.assertTrue(np.allclose(res_1, self.res_1_np))
+            self.assertTrue(np.allclose(res_2, self.res_2_np))
+            self.assertTrue(np.allclose(res_3, self.res_3_np))
+            #self.assertTrue(np.allclose(res_4, self.res_4_np))
+            self.assertTrue(np.allclose(res_5, self.res_5_np))
+
+    def test_dynamic_graph(self):
+        """Run the functional API in dynamic graph mode on every place."""
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.disable_static(place=place)
+            x = paddle.to_variable(self.x_np)
+
+            out_1 = paddle.nn.functional.adaptive_max_pool3d(
+                x=x, output_size=[3, 3, 3])
+
+            out_2 = paddle.nn.functional.adaptive_max_pool3d(x=x, output_size=5)
+
+            out_3 = paddle.nn.functional.adaptive_max_pool3d(
+                x=x, output_size=[2, 3, 5])
+
+            #out_4 = paddle.nn.functional.adaptive_max_pool3d(
+            #    x=x, output_size=[3, 3, 3], data_format="NDHWC")
+
+            out_5 = paddle.nn.functional.adaptive_max_pool3d(
+                x=x, output_size=[None, 3, None])
+
+            # assertTrue keeps these checks alive under `python -O` as well.
+            self.assertTrue(np.allclose(out_1.numpy(), self.res_1_np))
+            self.assertTrue(np.allclose(out_2.numpy(), self.res_2_np))
+            self.assertTrue(np.allclose(out_3.numpy(), self.res_3_np))
+            #self.assertTrue(np.allclose(out_4.numpy(), self.res_4_np))
+            self.assertTrue(np.allclose(out_5.numpy(), self.res_5_np))
+
+
+class TestAdaptiveMaxPool3dClassAPI(unittest.TestCase):
+    """Checks the paddle.nn.AdaptiveMaxPool3d layer against NumPy results."""
+
+    def setUp(self):
+        # Expected values come from the adaptive_pool3d_forward reference.
+        self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
+        self.res_1_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[3, 3, 3], pool_type="max")
+
+        self.res_2_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=5, pool_type="max")
+
+        self.res_3_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[2, 3, 5], pool_type="max")
+
+        # self.res_4_np = adaptive_pool3d_forward(
+        #     x=self.x_np,
+        #     output_size=[3, 3, 3],
+        #     pool_type="max",
+        #     data_format="NDHWC")
+
+        self.res_5_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[None, 3, None], pool_type="max")
+
+    def test_static_graph(self):
+        """Run the layer in static graph mode on every available place."""
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.enable_static()
+            x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+                output_size=[3, 3, 3])
+            out_1 = adaptive_max_pool(x=x)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(output_size=5)
+            out_2 = adaptive_max_pool(x=x)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+                output_size=[2, 3, 5])
+            out_3 = adaptive_max_pool(x=x)
+
+            #     adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+            #         output_size=[3, 3, 3], data_format="NDHWC")
+            #     out_4 = adaptive_max_pool(x=x)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+                output_size=[None, 3, None])
+            out_5 = adaptive_max_pool(x=x)
+
+            exe = paddle.static.Executor(place=place)
+            [res_1, res_2, res_3, res_5] = exe.run(
+                fluid.default_main_program(),
+                feed={"x": self.x_np},
+                fetch_list=[out_1, out_2, out_3, out_5])
+
+            # assertTrue rather than a bare assert: bare asserts are removed
+            # when Python runs with -O, which would silently skip the checks.
+            self.assertTrue(np.allclose(res_1, self.res_1_np))
+            self.assertTrue(np.allclose(res_2, self.res_2_np))
+            self.assertTrue(np.allclose(res_3, self.res_3_np))
+            #     self.assertTrue(np.allclose(res_4, self.res_4_np))
+            self.assertTrue(np.allclose(res_5, self.res_5_np))
+
+    def test_dynamic_graph(self):
+        """Run the layer in dynamic graph mode on every available place."""
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.disable_static(place=place)
+            x = paddle.to_variable(self.x_np)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+                output_size=[3, 3, 3])
+            out_1 = adaptive_max_pool(x=x)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(output_size=5)
+            out_2 = adaptive_max_pool(x=x)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+                output_size=[2, 3, 5])
+            out_3 = adaptive_max_pool(x=x)
+
+            #     adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+            #         output_size=[3, 3, 3], data_format="NDHWC")
+            #     out_4 = adaptive_max_pool(x=x)
+
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+                output_size=[None, 3, None])
+            out_5 = adaptive_max_pool(x=x)
+
+            # assertTrue keeps these checks alive under `python -O` as well.
+            self.assertTrue(np.allclose(out_1.numpy(), self.res_1_np))
+            self.assertTrue(np.allclose(out_2.numpy(), self.res_2_np))
+            self.assertTrue(np.allclose(out_3.numpy(), self.res_3_np))
+            #     self.assertTrue(np.allclose(out_4.numpy(), self.res_4_np))
+            self.assertTrue(np.allclose(out_5.numpy(), self.res_5_np))
+
+
+if __name__ == '__main__':
+    unittest.main()  # run the tests above when this file is executed directly
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
index c6d3c6e7d0492b2f4a98a595f015e3b9f4a19e24..5c705378e515eec4c950f6996e2789df603fcda3 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
@@ -85,10 +85,35 @@ class TestBatchNorm(unittest.TestCase):
                     y = bn(fluid.dygraph.to_variable(x))
                 return y.numpy()
 
+            def compute_v3(x, is_test, trainable_statistics):
+                # fluid.dygraph.BatchNorm with frozen scale 1.0 and bias 0.0.
+                with fluid.dygraph.guard(p):
+                    const = fluid.initializer.Constant
+                    scale_attr = fluid.ParamAttr(
+                        initializer=const(1.0), trainable=False)
+                    shift_attr = fluid.ParamAttr(
+                        initializer=const(0.0), trainable=False)
+                    bn = fluid.dygraph.BatchNorm(
+                        shape[1], is_test=is_test, param_attr=scale_attr,
+                        bias_attr=shift_attr,
+                        trainable_statistics=trainable_statistics)
+                    y = bn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            def compute_v4(x):
+                with fluid.dygraph.guard(p):  # weight_attr/bias_attr both False
+                    layer = paddle.nn.BatchNorm2d(
+                        shape[1], weight_attr=False, bias_attr=False)
+                    result = layer(fluid.dygraph.to_variable(x))
+                return result.numpy()
+
             x = np.random.randn(*shape).astype("float32")
             y1 = compute_v1(x, False, False)
             y2 = compute_v2(x)
+            y3 = compute_v3(x, False, False)
+            y4 = compute_v4(x)
             self.assertTrue(np.allclose(y1, y2))
+            self.assertTrue(np.allclose(y3, y4))
 
     def test_static(self):
         places = [fluid.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..01daea32167d28edbb46d6854872976aed79494e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py
@@ -0,0 +1,504 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+import paddle
+from paddle.fluid import Program, program_guard
+from paddle.nn.functional import interpolate
+
+
+def cubic_1(x, a):  # evaluates (a + 2)x^3 - (a + 3)x^2 + 1 in nested form
+    return ((a + 2) * x - (a + 3)) * x * x + 1
+
+
+def cubic_2(x, a):  # evaluates a*x^3 - 5a*x^2 + 8a*x - 4a in nested form
+    return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a
+
+
+def cubic_interp1d(x0, x1, x2, x3, t):
+    """Blend four neighbouring samples with cubic weights (a = -0.75).
+
+    The weights come from cubic_2(t + 1), cubic_1(t), cubic_1(1 - t) and
+    cubic_2(2 - t), applied to x0..x3 in that order.
+    """
+    a, near, far = -0.75, t, 1.0 - t
+    w0, w1 = cubic_2(near + 1.0, a), cubic_1(near, a)
+    w2, w3 = cubic_1(far, a), cubic_2(far + 1.0, a)
+    return x0 * w0 + x1 * w1 + x2 * w2 + x3 * w3
+
+
+def value_bound(input, w, h, x, y):
+    # Clamp (x, y) into the w-by-h grid, then read input[:, :, row, col].
+    col, row = int(max(min(x, w - 1), 0)), int(max(min(y, h - 1), 0))
+    return input[:, :, row, col]
+
+
+def bicubic_interp_np(input,
+                      out_h,
+                      out_w,
+                      out_size=None,
+                      actual_shape=None,
+                      align_corners=True,
+                      data_layout='kNCHW'):
+    """NumPy reference for bicubic interpolation of a 4-D batch.
+
+    Args:
+        input: array in NCHW, or NHWC when data_layout == "NHWC" (it is
+            transposed to NCHW internally and back before returning).
+        out_h, out_w: requested output height and width.
+        out_size: optional (h, w) pair that overrides out_h / out_w.
+        actual_shape: optional (h, w) pair that overrides both of the above.
+        align_corners: True maps output corners onto input corners; False
+            uses the half-pixel mapping ratio * (k + 0.5) - 0.5.
+        data_layout: only "NHWC" is special-cased; any other value is NCHW.
+
+    Returns:
+        Array with the input's dtype and layout and spatial size out_h x out_w.
+        Taps that fall outside the image reuse the nearest edge pixel.
+    """
+    if data_layout == "NHWC":
+        input = np.transpose(input, (0, 3, 1, 2))  # NHWC => NCHW
+    if out_size is not None:
+        out_h, out_w = out_size[0], out_size[1]
+    if actual_shape is not None:
+        out_h, out_w = actual_shape[0], actual_shape[1]
+    batch_size, channel, in_h, in_w = input.shape
+
+    ratio_h = ratio_w = 0.0
+    if out_h > 1:
+        ratio_h = (in_h - 1.0) / (out_h - 1.0) if align_corners \
+            else 1.0 * in_h / out_h
+    if out_w > 1:
+        ratio_w = (in_w - 1.0) / (out_w - 1.0) if align_corners \
+            else 1.0 * in_w / out_w
+
+    out = np.zeros((batch_size, channel, out_h, out_w))
+    for k in range(out_h):
+        h = ratio_h * k if align_corners else ratio_h * (k + 0.5) - 0.5
+        input_y = np.floor(h)
+        y_t = h - input_y
+        # Clamped rows of the four vertical taps; invariant over l, i, j.
+        ys = [int(max(min(input_y - 1 + d, in_h - 1), 0)) for d in range(4)]
+        for l in range(out_w):
+            w = ratio_w * l if align_corners else ratio_w * (l + 0.5) - 0.5
+            input_x = np.floor(w)
+            x_t = w - input_x
+            # Clamped columns of the horizontal taps; invariant over i, j.
+            xs = [int(max(min(input_x - 1 + d, in_w - 1), 0)) for d in range(4)]
+            for i in range(batch_size):
+                for j in range(channel):
+                    # Interpolate along x within each tap row, then along y.
+                    coefficients = [
+                        cubic_interp1d(input[i, j, r, xs[0]],
+                                       input[i, j, r, xs[1]],
+                                       input[i, j, r, xs[2]],
+                                       input[i, j, r, xs[3]], x_t)
+                        for r in ys
+                    ]
+                    out[i, j, k, l] = cubic_interp1d(
+                        coefficients[0], coefficients[1], coefficients[2],
+                        coefficients[3], y_t)
+    if data_layout == "NHWC":
+        out = np.transpose(out, (0, 2, 3, 1))  # NCHW => NHWC
+    return out.astype(input.dtype)
+
+
+class TestBicubicInterpOp(OpTest):
+    """OpTest case for the bicubic_interp_v2 operator.
+
+    Subclasses override init_test_case() to vary shapes, sizes and flags.
+    """
+
+    def setUp(self):
+        # Defaults are set first so init_test_case() can override them.
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCHW'
+        self.init_test_case()
+        self.op_type = "bicubic_interp_v2"
+        input_np = np.random.random(self.input_shape).astype("float64")
+
+        h_axis = 2 if self.data_layout == "NCHW" else 1
+        in_h, in_w = self.input_shape[h_axis], self.input_shape[h_axis + 1]
+
+        if self.scale:
+            # Reference output size implied by the scale factor(s).
+            if isinstance(self.scale, (float, int)) and self.scale > 0.:
+                scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_h, scale_w = self.scale[0], self.scale[1]
+            out_h, out_w = int(in_h * scale_h), int(in_w * scale_w)
+        else:
+            out_h, out_w = self.out_h, self.out_w
+
+        output_np = bicubic_interp_np(input_np, out_h, out_w, self.out_size,
+                                      self.actual_shape, self.align_corners,
+                                      self.data_layout)
+        self.inputs = {'X': input_np}
+        # actual_shape is applied last, so it wins when both are set.
+        for size_input in (self.out_size, self.actual_shape):
+            if size_input is not None:
+                self.inputs['OutSize'] = size_input
+
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'data_layout': self.data_layout
+        }
+        if self.scale:
+            # A positive scalar or 1-element list is widened to [s, s].
+            if isinstance(self.scale, (float, int)) and self.scale > 0.:
+                self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = self.scale * 2
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        # Base case: 5x5 maps; out_size [3, 3] is fed as the OutSize input.
+        self.interp_method = 'bicubic'
+        self.input_shape = [2, 3, 5, 5]
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
+
+
+class TestBicubicInterpCase1(TestBicubicInterpOp):
+    def init_test_case(self):
+        # 7x8 maps reduced to a single pixel (out_h = out_w = 1).
+        self.interp_method = 'bicubic'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h, self.out_w = 1, 1
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestBicubicInterpCase2(TestBicubicInterpOp):
+    def init_test_case(self):
+        # 9x6 maps enlarged to 10x8 with aligned corners.
+        self.interp_method = 'bicubic'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h, self.out_w = 10, 8
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestBicubicInterpCase3(TestBicubicInterpOp):
+    def init_test_case(self):
+        # 32x64 maps resized to 64x32 without corner alignment.
+        self.interp_method = 'bicubic'
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h, self.out_w = 64, 32
+        self.scale = 0.
+        self.align_corners = False
+
+
+class TestBicubicInterpCase4(TestBicubicInterpOp):
+    def init_test_case(self):
+        # out_size [2, 2] replaces out_h/out_w in the NumPy reference.
+        self.interp_method = 'bicubic'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h, self.out_w = 1, 1
+        self.scale = 0.
+        self.out_size = np.array([2, 2]).astype("int32")
+        self.align_corners = True
+
+
+class TestBicubicInterpCase5(TestBicubicInterpOp):
+    def init_test_case(self):
+        # out_size [6, 4] replaces the 11x11 attrs in the NumPy reference.
+        self.interp_method = 'bicubic'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h, self.out_w = 11, 11
+        self.scale = 0.
+        self.out_size = np.array([6, 4]).astype("int32")
+        self.align_corners = False
+
+
+class TestBicubicInterpCase6(TestBicubicInterpOp):
+    def init_test_case(self):
+        # out_size repeats the 64x32 attrs; scale is the int 0 here.
+        self.interp_method = 'bicubic'
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h, self.out_w = 64, 32
+        self.scale = 0
+        self.out_size = np.array([64, 32]).astype("int32")
+        self.align_corners = False
+
+
+class TestBicubicInterpSame(TestBicubicInterpOp):
+    def init_test_case(self):
+        # Output size equals the 32x64 input size.
+        self.interp_method = 'bicubic'
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h, self.out_w = 32, 64
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestBicubicInterpScale(TestBicubicInterpOp):
+    def init_test_case(self):
+        # Truthy scale: setUp derives the reference size from it.
+        self.interp_method = 'bicubic'
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h, self.out_w = 32, 64
+        self.scale = [1., 1.]
+        self.align_corners = True
+
+
+class TestBicubicInterpDataLayout(TestBicubicInterpOp):
+    def init_test_case(self):
+        # Channels-last input: [2, 5, 5, 3] is read as NHWC.
+        self.interp_method = 'bicubic'
+        self.input_shape = [2, 5, 5, 3]
+        self.out_h, self.out_w = 2, 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
+        self.data_layout = "NHWC"
+
+
+class TestBicubicInterpOpAPI(unittest.TestCase):
+    def test_case(self):
+        """Compare interpolate(mode='bicubic') with the NumPy reference.
+
+        Seven size/scale_factor spellings run statically, one under dygraph.
+        """
+        np.random.seed(200)
+        x_data = np.random.random((2, 3, 6, 6)).astype("float32")
+        dim_data = np.array([12]).astype("int32")
+        shape_data = np.array([12, 12]).astype("int32")
+        actual_size_data = np.array([12, 12]).astype("int32")
+        scale_data = np.array([2.0]).astype("float32")
+        # Every variant below resizes the 6x6 input to 12x12.
+        expected = bicubic_interp_np(
+            x_data, out_h=12, out_w=12, align_corners=False)
+
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+        else:
+            place = fluid.CPUPlace()
+
+        with fluid.program_guard(prog, startup_prog):
+            x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+            dim = fluid.data(name="dim", shape=[1], dtype="int32")
+            shape_tensor = fluid.data(
+                name="shape_tensor", shape=[2], dtype="int32")
+            actual_size = fluid.data(
+                name="actual_size", shape=[2], dtype="int32")
+            scale_tensor = fluid.data(
+                name="scale_tensor", shape=[1], dtype="float32")
+
+            def resize(**target):
+                # Shared call shape; only the size/scale argument varies.
+                return interpolate(
+                    x, mode='bicubic', align_corners=False, **target)
+
+            fetches = [
+                resize(size=[12, 12]),
+                resize(size=[12, dim]),
+                resize(size=shape_tensor),
+                resize(size=[12, 12]),
+                resize(scale_factor=scale_tensor),
+                resize(scale_factor=2.0),
+                resize(scale_factor=[2.0, 2.0]),
+            ]
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            results = exe.run(
+                fluid.default_main_program(),
+                feed={
+                    "x": x_data,
+                    "dim": dim_data,
+                    "shape_tensor": shape_data,
+                    "actual_size": actual_size_data,
+                    "scale_tensor": scale_data
+                },
+                fetch_list=fetches,
+                return_numpy=True)
+
+            for res in results:
+                self.assertTrue(np.allclose(res, expected))
+
+        with fluid.dygraph.guard():
+            x = fluid.dygraph.to_variable(x_data)
+            interp = interpolate(
+                x, size=[12, 12], mode='bicubic', align_corners=False)
+            dy_result = interp.numpy()
+            self.assertTrue(np.allclose(dy_result, expected))
+
+
+class TestBicubicOpError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            # the input of interpolate must be a Variable.
+            x1 = fluid.create_lod_tensor(
+                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            self.assertRaises(TypeError, interpolate, x1)
+
+            def test_mode_type():
+                # mode must be "BILINEAR" "TRILINEAR" "NEAREST" "BICUBIC"
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+
+                out = interpolate(
+                    x, size=[12, 12], mode='UNKONWN', align_corners=False)
+
+            def test_input_shape():
+                x = fluid.data(name="x", shape=[2], dtype="float32")
+                out = interpolate(
+                    x, size=[12, 12], mode='BICUBIC', align_corners=False)
+
+            def test_align_corcers():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                interpolate(x, size=[12, 12], mode='BICUBIC', align_corners=3)
+
+            def test_out_shape():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x, size=[12], mode='bicubic', align_corners=False)
+
+            def test_attr_data_format():
+                # for 5-D input, data_format only can be NCDHW or NDHWC
+                input = fluid.data(
+                    name="input", shape=[2, 3, 6, 9, 4], dtype="float32")
+                out = interpolate(
+                    input,
+                    size=[4, 8, 4, 5],
+                    mode='trilinear',
+                    data_format='NHWC')
+
+            def test_actual_shape():
+                # the actual_shape  must be Variable.
+                x = fluid.create_lod_tensor(
+                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                out = interpolate(
+                    x, size=[12, 12], mode='BICUBIC', align_corners=False)
+
+            def test_scale_value():
+                # the scale must be greater than zero.
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='BICUBIC',
+                    align_corners=False,
+                    scale_factor=-2.0)
+
+            def test_attr_5D_input():
+                # for 5-D input, data_format only can be NCDHW or NDHWC
+                input = fluid.data(
+                    name="input", shape=[2, 3, 6, 9, 4], dtype="float32")
+                out = interpolate(
+                    input,
+                    size=[4, 8, 4, 5],
+                    mode='trilinear',
+                    data_format='NDHWC')
+
+            def test_scale_type():
+                # the scale must be greater than zero.
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                scale = fluid.create_lod_tensor(
+                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='bicubic',
+                    align_corners=False,
+                    scale_factor=scale)
+
+            def test_align_mode():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='nearest',
+                    align_corners=False,
+                    align_mode=2,
+                    scale_factor=1.0)
+
+            def test_outshape_and_scale():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='bicubic',
+                    align_corners=False,
+                    scale_factor=None)
+
+            def test_align_corners_and_nearest():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='nearest',
+                    align_corners=True,
+                    scale_factor=None)
+
+            def test_scale_shape():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='nearest',
+                    align_corners=False,
+                    scale_factor=[1, 2, 2])
+
+            def test_scale_value():
+                x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+                out = interpolate(
+                    x,
+                    size=None,
+                    mode='trilinear',
+                    align_corners=False,
+                    scale_factor=[1, 2, 2])
+
+            self.assertRaises(ValueError, test_mode_type)
+            self.assertRaises(ValueError, test_input_shape)
+            self.assertRaises(TypeError, test_align_corcers)
+            self.assertRaises(ValueError, test_attr_data_format)
+            self.assertRaises(TypeError, test_actual_shape)
+            self.assertRaises(ValueError, test_scale_value)
+            self.assertRaises(ValueError, test_out_shape)
+            self.assertRaises(ValueError, test_attr_5D_input)
+            self.assertRaises(TypeError, test_scale_type)
+            self.assertRaises(ValueError, test_align_mode)
+            self.assertRaises(ValueError, test_outshape_and_scale)
+            self.assertRaises(ValueError, test_align_corners_and_nearest)
+            self.assertRaises(ValueError, test_scale_shape)
+            self.assertRaises(ValueError, test_scale_value)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..d139a53c7e2ccc68964457f3142b4ed890d339f2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py
@@ -0,0 +1,620 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.nn.functional import interpolate
+import paddle
+
+
+def bilinear_interp_np(input,
+                       out_h,
+                       out_w,
+                       out_size=None,
+                       actual_shape=None,
+                       align_corners=True,
+                       align_mode=0,
+                       data_layout='NCHW'):
+    """bilinear interpolation implement in shape [N, C, H, W]"""
+    if data_layout == "NHWC":
+        input = np.transpose(input, (0, 3, 1, 2))  # NHWC => NCHW
+    if out_size is not None:  # out_size, when given, overrides out_h/out_w
+        out_h = out_size[0]
+        out_w = out_size[1]
+    if actual_shape is not None:  # actual_shape overrides both of the above
+        out_h = actual_shape[0]
+        out_w = actual_shape[1]
+    batch_size, channel, in_h, in_w = input.shape
+
+    ratio_h = ratio_w = 0.0  # a ratio stays 0.0 when that output extent is 1
+    if out_h > 1:
+        if (align_corners):
+            ratio_h = (in_h - 1.0) / (out_h - 1.0)
+        else:
+            ratio_h = 1.0 * in_h / out_h
+    if out_w > 1:
+        if (align_corners):
+            ratio_w = (in_w - 1.0) / (out_w - 1.0)
+        else:
+            ratio_w = 1.0 * in_w / out_w
+
+    out = np.zeros((batch_size, channel, out_h, out_w))
+
+    for i in range(out_h):
+        if (align_mode == 0 and not align_corners):  # source index is ratio * (i + 0.5) - 0.5
+            h = int(ratio_h * (i + 0.5) - 0.5)
+        else:
+            h = int(ratio_h * i)
+
+        h = max(0, h)  # clamp the top neighbour row to the first row
+        hid = 1 if h < in_h - 1 else 0  # 0 on the last row so h + hid stays in bounds
+        if (align_mode == 0 and not align_corners):
+            idx_src_h = max(ratio_h * (i + 0.5) - 0.5, 0)
+            h1lambda = idx_src_h - h  # weight of row h + hid
+        else:
+            h1lambda = ratio_h * i - h
+        h2lambda = 1.0 - h1lambda  # weight of row h
+        for j in range(out_w):
+            if (align_mode == 0 and not align_corners):
+                w = int(ratio_w * (j + 0.5) - 0.5)
+            else:
+                w = int(ratio_w * j)
+            w = max(0, w)
+            wid = 1 if w < in_w - 1 else 0  # same bounds guard as hid, for columns
+            if (align_mode == 0 and not align_corners):
+                idx_src_w = max(ratio_w * (j + 0.5) - 0.5, 0)
+                w1lambda = idx_src_w - w
+            else:
+                w1lambda = ratio_w * j - w
+            w2lambda = 1.0 - w1lambda
+
+            out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] +
+                                        w1lambda*input[:, :, h, w+wid]) + \
+                h1lambda*(w2lambda*input[:, :, h+hid, w] +
+                          w1lambda*input[:, :, h+hid, w+wid])
+
+    if data_layout == "NHWC":
+        out = np.transpose(out, (0, 2, 3, 1))  # NCHW => NHWC
+
+    return out.astype(input.dtype)  # result is cast back to the input's dtype
+
+
+class TestBilinearInterpOp(OpTest):  # base case; subclasses override init_test_case()
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCHW'
+        self.init_test_case()  # may overwrite the three defaults set above
+        self.op_type = "bilinear_interp_v2"
+        input_np = np.random.random(self.input_shape).astype("float64")
+
+        if self.data_layout == "NCHW":
+            in_h = self.input_shape[2]
+            in_w = self.input_shape[3]
+        else:
+            in_h = self.input_shape[1]
+            in_w = self.input_shape[2]
+
+        if self.scale:  # truthy scale: reference size comes from scale, not out_h/out_w
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0.:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(in_h * scale_h)
+            out_w = int(in_w * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size,
+                                       self.actual_shape, self.align_corners,
+                                       self.align_mode, self.data_layout)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        if self.actual_shape is not None:  # actual_shape, when set, replaces OutSize
+            self.inputs['OutSize'] = self.actual_shape
+
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode,
+            'data_layout': self.data_layout
+        }
+        if self.scale:  # the 'scale' attr is always passed as a 2-element list
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0.:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):  # 5x5 input; OutSize [3, 3] is fed alongside out_h/out_w = 2
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 5]
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase1(TestBilinearInterpOp):  # 7x8 input, 1x1 output, no OutSize
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase2(TestBilinearInterpOp):  # 9x6 input, 12x12 output, no OutSize
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase3(TestBilinearInterpOp):  # 32x64 input, 64x32 output, no OutSize
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase4(TestBilinearInterpOp):  # Case1 attrs plus an OutSize input of [2, 2]
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.out_size = np.array([2, 2]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase5(TestBilinearInterpOp):  # Case2 attrs plus an OutSize input of [11, 11]
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.out_size = np.array([11, 11]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase6(TestBilinearInterpOp):  # Case3 attrs plus an OutSize input of [65, 33]
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([65, 33]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpSame(TestBilinearInterpOp):  # output size equals the 32x64 input size
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h = 32
+        self.out_w = 64
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpActualShape(TestBilinearInterpOp):  # NOTE(review): sets out_size, not actual_shape
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpDataLayout(TestBilinearInterpOp):  # base case with an NHWC input [2, 5, 5, 3]
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 5, 5, 3]
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+        self.data_layout = "NHWC"
+
+
+class TestBilinearInterpOpUint8(OpTest):  # uint8 input; output check only, no gradient test
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "bilinear_interp_v2"
+        input_np = np.random.randint(
+            low=0, high=256, size=self.input_shape).astype("uint8")
+
+        if self.scale:  # truthy scale: reference size comes from scale, not out_h/out_w
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(self.input_shape[2] * scale_h)  # H is read from axis 2, W from axis 3
+            out_w = int(self.input_shape[3] * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size,
+                                       self.actual_shape, self.align_corners,
+                                       self.align_mode)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode
+        }
+        if self.scale:  # the 'scale' attr is always passed as a 2-element list
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):  # checked on CPUPlace only, with atol=1
+        self.check_output_with_place(place=core.CPUPlace(), atol=1)
+
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [1, 3, 9, 6]
+        self.out_h = 10
+        self.out_w = 9
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8):  # 32x64 input, 64x32 output
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8):  # 7x8 input with an OutSize input of [6, 15]
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 5
+        self.out_w = 13
+        self.scale = 0.
+        self.out_size = np.array([6, 15]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpOtherMethod1(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 1
+
+
+class TestBilinearInterpWithMethod2(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 0
+
+
+class TestBilinearInterpWithMethod3(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = True
+        self.align_mode = 0
+
+
+class TestBilinearInterpScale1(TestBilinearInterpOp):  # scale 2.0: reference output is 10x14
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 2.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpScale2(TestBilinearInterpOp):  # scale 1.0: reference output keeps the 5x7 size
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 1.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpScale3(TestBilinearInterpOp):  # scale 1.5: reference output is int(7.5) x int(10.5)
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 1.5
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpScale4(TestBilinearInterpOp):  # per-axis scale list: [scale_h, scale_w]
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = [1.5, 0.5]
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpZero(TestBilinearInterpOp):  # scale 0.2 with align_corners=False, align_mode=0
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 0.2
+        self.align_corners = False
+        self.align_mode = 0
+
+
+class TestBilinearInterpOp_attr_tensor(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "bilinear_interp_v2"
+        self.shape_by_1Dtensor = False
+        self.scale_by_1Dtensor = False
+        self.attrs = {
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+        }
+
+        input_np = np.random.random(self.input_shape).astype("float64")
+        self.inputs = {'X': input_np}
+
+        if self.scale_by_1Dtensor:
+            self.inputs['Scale'] = np.array([self.scale]).astype("float32")
+        elif self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(self.input_shape[2] * scale_h)
+            out_w = int(self.input_shape[3] * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        if self.shape_by_1Dtensor:
+            self.inputs['OutSize'] = self.out_size
+        elif self.out_size is not None:
+            size_tensor = []
+            for index, ele in enumerate(self.out_size):
+                size_tensor.append(("x" + str(index), np.ones(
+                    (1)).astype('int32') * ele))
+            self.inputs['SizeTensor'] = size_tensor
+
+        self.attrs['out_h'] = self.out_h
+        self.attrs['out_w'] = self.out_w
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size,
+                                       self.actual_shape, self.align_corners)
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 5]
+        self.out_h = 3
+        self.out_w = 3
+        self.scale = 0.
+        self.out_size = [3, 3]
+        self.align_corners = True
+
+
+# out_size is a list; each element is fed as a 1-element SizeTensor entry
+class TestBilinearInterp_attr_tensor_Case1(TestBilinearInterpOp_attr_tensor):
+    def init_test_case(self):  # out_size [8, 12] differs from the out_h/out_w attrs (12, 12)
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.out_size = [8, 12]
+        self.align_corners = True
+
+
+# out_size is a 1-D tensor
+class TestBilinearInterp_attr_tensor_Case2(TestBilinearInterpOp_attr_tensor):
+    def init_test_case(self):  # out_size is an int32 ndarray and shape_by_1Dtensor is requested
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+        self.shape_by_1Dtensor = True
+
+
+# scale is a 1-D tensor
+class TestBilinearInterp_attr_tensor_Case3(TestBilinearInterpOp_attr_tensor):
+    def init_test_case(self):  # scale 2.0 with no out_size; scale_by_1Dtensor is requested
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 2.0
+        self.out_size = None
+        self.align_corners = True
+        self.scale_by_1Dtensor = True
+
+
+class TestBilinearInterpOpAPI(unittest.TestCase):
+    def test_case(self):  # five resize_bilinear variants; each must match the 12x12 reference
+        x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+
+        dim = fluid.data(name="dim", shape=[1], dtype="int32")
+        shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32")
+        actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32")
+        scale_tensor = fluid.data(
+            name="scale_tensor", shape=[1], dtype="float32")
+
+        out1 = fluid.layers.resize_bilinear(x, out_shape=[12, 12])  # plain int list
+        out2 = fluid.layers.resize_bilinear(x, out_shape=[12, dim])  # list mixing int and Variable
+        out3 = fluid.layers.resize_bilinear(x, out_shape=shape_tensor)  # whole shape as a Variable
+        out4 = fluid.layers.resize_bilinear(
+            x, out_shape=[4, 4], actual_shape=actual_size)  # fed [12, 12]; the assert expects 12x12
+        out5 = fluid.layers.resize_bilinear(x, scale=scale_tensor)  # scale as a Variable
+
+        x_data = np.random.random((2, 3, 6, 6)).astype("float32")
+        dim_data = np.array([12]).astype("int32")
+        shape_data = np.array([12, 12]).astype("int32")
+        actual_size_data = np.array([12, 12]).astype("int32")
+        scale_data = np.array([2.0]).astype("float32")
+
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        results = exe.run(fluid.default_main_program(),
+                          feed={
+                              "x": x_data,
+                              "dim": dim_data,
+                              "shape_tensor": shape_data,
+                              "actual_size": actual_size_data,
+                              "scale_tensor": scale_data
+                          },
+                          fetch_list=[out1, out2, out3, out4, out5],
+                          return_numpy=True)
+
+        expect_res = bilinear_interp_np(
+            x_data, out_h=12, out_w=12, align_corners=True)
+        for res in results:
+            self.assertTrue(np.allclose(res, expect_res))
+
+
+class TestUpsampleBilinear2dInterpOpAPI2_0(unittest.TestCase):
+    def test_case(self):  # dygraph UpsamplingBilinear2d vs. the align_corners=True reference
+
+        # dygraph
+        x_data = np.random.random((1, 3, 6, 6)).astype("float32")
+        upsample = paddle.nn.UpsamplingBilinear2d(scale_factor=[2, 2])  # built outside the guard
+        with fluid.dygraph.guard():
+            x = fluid.dygraph.to_variable(x_data)
+            interp = upsample(x)
+            expect = bilinear_interp_np(
+                x_data, out_h=12, out_w=12, align_corners=True)
+            self.assertTrue(np.allclose(interp.numpy(), expect))
+
+
+class TestBilinearInterpOpAPI_dy(unittest.TestCase):
+    def test_case(self):  # dygraph interpolate(mode="bilinear") vs. the align_corners=False reference
+        import paddle  # NOTE(review): redundant, paddle is already imported at module level
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        with fluid.dygraph.guard(place):
+            input_data = np.random.random((2, 3, 6, 6)).astype("float32")
+            input_x = paddle.to_tensor(input_data)
+            expect_res = bilinear_interp_np(
+                input_data, out_h=12, out_w=12, align_corners=False)
+            out = interpolate(
+                x=input_x, size=[12, 12], mode="bilinear", align_corners=False)
+            self.assertTrue(np.allclose(out.numpy(), expect_res))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
index 671efd8c721550256c181059528bead43deb0718..43d485a0a6d24be6e8db32f16fe96a70bb229858 100644
--- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.framework import Parameter
 import numpy as np
@@ -44,10 +45,10 @@ class InplaceTestBase(unittest.TestCase):
 
     def build_program_and_scope(self):
         self.place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
+        paddle.manual_seed(1)
+        paddle.framework.random._manual_program_seed(1)
         startup_program = fluid.Program()
         main_program = fluid.Program()
-        startup_program.random_seed = 1
-        main_program.random_seed = 1
 
         scope = fluid.Scope()
         with fluid.program_guard(main_program, startup_program):
diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py
index 93b31f052aae14546effef1696d2719dfff6727b..74c01e1424885051faf3e263e6ca26c1269a838e 100644
--- a/python/paddle/fluid/tests/unittests/test_clip_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_op.py
@@ -93,6 +93,13 @@ class TestCase4(TestClipOp):
         self.inputs['Min'] = np.array([0.3]).astype('float32')
 
 
+class TestCase5(TestClipOp):  # degenerate clip interval: min == max == 0.5
+    def initTestCase(self):
+        self.shape = (4, 8, 16)
+        self.max = 0.5
+        self.min = 0.5
+
+
 class TestClipOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
@@ -112,6 +119,7 @@ class TestClipOpError(unittest.TestCase):
 
 class TestClipAPI(unittest.TestCase):
     def test_clip(self):
+        paddle.enable_static()
         data_shape = [1, 9, 9, 4]
         data = np.random.random(data_shape).astype('float32')
         images = fluid.data(name='image', shape=data_shape, dtype='float32')
@@ -128,15 +136,19 @@ class TestClipAPI(unittest.TestCase):
         out_4 = paddle.clip(images, max=0.7)
         out_5 = paddle.clip(images, min=min)
         out_6 = paddle.clip(images, max=max)
+        out_7 = paddle.clip(images, max=-1.)
+        out_8 = paddle.clip(images)
 
-        res1, res2, res3, res4, res5, res6 = exe.run(
+        res1, res2, res3, res4, res5, res6, res7, res8 = exe.run(
             fluid.default_main_program(),
             feed={
                 "image": data,
                 "min": np.array([0.2]).astype('float32'),
                 "max": np.array([0.8]).astype('float32')
             },
-            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6])
+            fetch_list=[
+                out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
+            ])
 
         self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8)))
         self.assertTrue(np.allclose(res2, data.clip(0.2, 0.9)))
@@ -144,6 +156,8 @@ class TestClipAPI(unittest.TestCase):
         self.assertTrue(np.allclose(res4, data.clip(max=0.7)))
         self.assertTrue(np.allclose(res5, data.clip(min=0.2)))
         self.assertTrue(np.allclose(res6, data.clip(max=0.8)))
+        self.assertTrue(np.allclose(res7, data.clip(max=-1)))
+        self.assertTrue(np.allclose(res8, data))
 
     def test_clip_dygraph(self):
         place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
@@ -163,10 +177,8 @@ class TestClipAPI(unittest.TestCase):
         paddle.enable_static()
         x1 = fluid.data(name='x1', shape=[1], dtype="int16")
         x2 = fluid.data(name='x2', shape=[1], dtype="int8")
-        x3 = fluid.data(name='x3', shape=[1], dtype="float32")
         self.assertRaises(TypeError, paddle.clip, x=x1, min=0.2, max=0.8)
         self.assertRaises(TypeError, paddle.clip, x=x2, min=0.2, max=0.8)
-        self.assertRaises(Exception, paddle.clip, x=x3)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
index b04bd0cbdefbd6618cfc2b2a865c07a52a67d16b..437b8b7befae470ab438cabc40817996cda5c938 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/fluid/tests/unittests/test_compiled_program.py b/python/paddle/fluid/tests/unittests/test_compiled_program.py
index 8430f39578047facdeae6535db134152a7038717..751fed2e56126909d36670b56afbeee9bf2a694f 100644
--- a/python/paddle/fluid/tests/unittests/test_compiled_program.py
+++ b/python/paddle/fluid/tests/unittests/test_compiled_program.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
 from test_imperative_base import new_program_scope
@@ -29,8 +30,8 @@ class TestCompiledProgram(unittest.TestCase):
         self.label = np.random.randint(
             low=0, high=10, size=[16, 1], dtype=np.int64)
         with new_program_scope():
-            fluid.default_startup_program().random_seed = self.seed
-            fluid.default_main_program().random_seed = self.seed
+            paddle.manual_seed(self.seed)
+            paddle.framework.random._manual_program_seed(self.seed)
             place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
             ) else fluid.CPUPlace()
             exe = fluid.Executor(place)
@@ -46,8 +47,8 @@ class TestCompiledProgram(unittest.TestCase):
 
     def test_compiled_program_base(self):
         with new_program_scope():
-            fluid.default_startup_program().random_seed = self.seed
-            fluid.default_main_program().random_seed = self.seed
+            paddle.manual_seed(self.seed)
+            paddle.framework.random._manual_program_seed(self.seed)
             place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
             ) else fluid.CPUPlace()
             exe = fluid.Executor(place)
@@ -64,8 +65,8 @@ class TestCompiledProgram(unittest.TestCase):
 
     def test_compiled_program_with_data_parallel(self):
         with new_program_scope():
-            fluid.default_startup_program().random_seed = self.seed
-            fluid.default_main_program().random_seed = self.seed
+            paddle.manual_seed(self.seed)
+            paddle.framework.random._manual_program_seed(self.seed)
             place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
             ) else fluid.CPUPlace()
             exe = fluid.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
index a16f21c0f97c0902dd6c26561ed3f707b28ff947..cc0f3745bbf7bb1fae0c0ac430491ac2e0d9b55f 100644
--- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
@@ -34,10 +34,10 @@ def random_reader():
 
 
 def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
+    paddle.manual_seed(1)
+    paddle.framework.random._manual_program_seed(1)
     startup_prog = fluid.Program()
     main_prog = fluid.Program()
-    startup_prog.random_seed = 1
-    main_prog.random_seed = 1
 
     with fluid.unique_name.guard():
         with fluid.program_guard(main_prog, startup_prog):
diff --git a/python/paddle/fluid/tests/unittests/test_diag.py b/python/paddle/fluid/tests/unittests/test_diag.py
index 8bf40459902e09f19a5badce62084841a0a23619..780d57b53310bb5f385a131d4ad52dd6f5e695f0 100644
--- a/python/paddle/fluid/tests/unittests/test_diag.py
+++ b/python/paddle/fluid/tests/unittests/test_diag.py
@@ -83,6 +83,14 @@ class TestDiagV2Error(unittest.TestCase):
 
             self.assertRaises(TypeError, test_diag_v2_type)
 
+            x = paddle.static.data('data', [3, 3])
+            self.assertRaises(TypeError, paddle.diag, x, offset=2.5)
+
+            self.assertRaises(TypeError, paddle.diag, x, padding_value=[9])
+
+            x = paddle.static.data('data2', [3, 3, 3])
+            self.assertRaises(ValueError, paddle.diag, x)
+
 
 class TestDiagV2API(unittest.TestCase):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py
index bc858828058079e7d54d3c753807725ce654a778..2919ec5e9ca97b1d59af46a54b2d702cb6de4a14 100644
--- a/python/paddle/fluid/tests/unittests/test_directory_migration.py
+++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py
@@ -39,9 +39,9 @@ class TestDirectory(unittest.TestCase):
             'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad',
             'paddle.no_grad', 'paddle.save', 'paddle.load',
             'paddle.static.save', 'paddle.static.load',
-            'paddle.BackwardStrategy', 'paddle.ParallelEnv',
-            'paddle.prepare_context', 'paddle.DataParallel', 'paddle.jit',
-            'paddle.jit.TracedLayer', 'paddle.jit.to_static',
+            'paddle.distributed.ParallelEnv',
+            'paddle.distributed.prepare_context', 'paddle.DataParallel',
+            'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static',
             'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer',
             'paddle.jit.save', 'paddle.jit.load', 'paddle.jit.SaveLoadConfig',
             'paddle.NoamDecay', 'paddle.PiecewiseDecay',
@@ -98,7 +98,6 @@ class TestDirectory(unittest.TestCase):
             'paddle.imperative.enable', 'paddle.imperative.guard',
             'paddle.imperative.grad', 'paddle.imperative.no_grad',
             'paddle.imperative.save', 'paddle.imperative.load',
-            'paddle.imperative.BackwardStrategy',
             'paddle.imperative.ParallelEnv',
             'paddle.imperative.prepare_context',
             'paddle.imperative.DataParalell', 'paddle.imperative.jit',
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index ba292f2d87c376ace317fc3fb9b81ce5c5596eb2..faff81fa84fb5fa66c9ff14f782d2301e3964672 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -23,8 +23,11 @@ import subprocess
 import six
 import argparse
 import pickle
+import random
 import numpy as np
 import time
+
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import compiler
 import paddle.fluid.dygraph as dygraph
@@ -382,22 +385,22 @@ class TestParallelDyGraphRunnerBase(object):
         raise NotImplementedError(
             "train_one_loop should be implemented by the child classes.")
 
+    def _get_data(self, batch, args):
+        if args.update_method != "local":
+            new_batch = []
+            for offset, item in enumerate(batch):
+                if offset % 2 == args.trainer_id:
+                    new_batch.append(item)
+            return new_batch
+        else:
+            return batch
+
     def run_trainer(self, args):
 
         seed = 90
         device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
         place = fluid.CUDAPlace(device_id)
 
-        def _get_data(batch):
-            if args.update_method != "local":
-                new_batch = []
-                for offset, item in enumerate(batch):
-                    if offset % 2 == args.trainer_id:
-                        new_batch.append(item)
-                return new_batch
-            else:
-                return batch
-
         with fluid.dygraph.guard(place):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
@@ -422,7 +425,7 @@ class TestParallelDyGraphRunnerBase(object):
             out_losses = []
             print_to_err(type(self).__name__, "begin to run dygraph training")
             for step_id, data in enumerate(train_reader()):
-                data = _get_data(data)
+                data = self._get_data(data, args)
                 if step_id == RUN_STEP:
                     break
                 loss = self.run_one_loop(model, opt, data)
@@ -444,6 +447,47 @@ class TestParallelDyGraphRunnerBase(object):
                 model.clear_gradients()
         print_to_out(out_losses)
 
+    def run_trainer_with_spawn(self, args):
+        # 1. switch to dygraph (imperative) mode
+        paddle.disable_static()
+
+        # 2. seed every RNG used here so that runs are reproducible
+        seed = 90
+        paddle.static.default_startup_program().random_seed = seed
+        paddle.static.default_main_program().random_seed = seed
+        np.random.seed(seed)
+        random.seed(seed)  # must be a call; assignment would replace the function
+        # the trainer id is this process's rank; _get_data shards batches by it
+        args.trainer_id = paddle.distributed.get_rank()
+
+        # 3. init parallel env (only for the nccl2 update method)
+        if args.update_method == "nccl2":
+            paddle.distributed.init_parallel_env()
+
+        # 4. train model for at most RUN_STEP steps, collecting each step's loss
+        model, train_reader, opt = self.get_model()
+        if args.update_method == "nccl2":
+            model = paddle.DataParallel(model)
+
+        out_losses = []
+        for step_id, data in enumerate(train_reader()):
+            data = self._get_data(data, args)
+            if step_id == RUN_STEP:
+                break
+            loss = self.run_one_loop(model, opt, data)
+            out_losses.append(loss.numpy())
+
+            if args.update_method == "nccl2":
+                loss = model.scale_loss(loss)
+
+            loss.backward()
+            if args.update_method == "nccl2":
+                model.apply_collective_grads()
+
+            opt.minimize(loss)
+            model.clear_gradients()
+        return out_losses
+
 
 def runtime_main(test_class):
     parser = argparse.ArgumentParser(description='Run dist test.')
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d744c8299f484fd60a081adb1b3b9eb2834ddef
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
@@ -0,0 +1,388 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+"""
+    high level unit test for distribute fleet.
+"""
+
+import os
+import sys
+import subprocess
+
+import six
+import shutil
+import numpy as np
+import argparse
+from contextlib import closing
+import socket
+import time
+import tempfile
+import unittest
+
+import paddle
+import paddle.fluid as fluid
+import paddle.distributed.fleet.base.role_maker as role_maker
+from paddle.distributed.fleet.base.util_factory import fleet_util
+from paddle.distributed.fleet import fleet
+
+__all__ = ['FleetDistHeterRunnerBase', 'TestFleetHeterBase', 'runtime_main']
+
+RUN_STEP = 5
+LEARNING_RATE = 0.01
+DIST_UT_PORT = 0
+
+
+class FleetDistHeterRunnerBase(object):
+    """
+        run_pserver,run_trainer : after init role, using transpiler split program
+        net : implemented by child class, the network of model
+        do training : exe run program
+    """
+
+    def build_role(self, args):
+        environs = {}
+        environs["PADDLE_PSERVERS_IP_PORT_LIST"] = args.endpoints
+        environs["PADDLE_TRAINER_ENDPOINTS"] = args.trainer_endpoints
+        environs[
+            "PADDLE_HETER_TRAINER_IP_PORT_LIST"] = args.heter_trainer_endpoints
+        environs["PADDLE_HETER_TRAINER_DEVICE"] = args.heter_trainer_device
+        environs["TRAINING_ROLE"] = args.role.upper()
+        environs["PADDLE_TRAINERS_NUM"] = args.trainers
+        environs["PADDLE_TRAINER_ID"] = args.current_id
+        if args.role.upper() == "PSERVER":
+            environs["POD_IP"] = args.endpoints.split(",")[int(
+                args.current_id)].split(":")[0]
+            environs["PADDLE_PORT"] = args.endpoints.split(",")[int(
+                args.current_id)].split(":")[1]
+        elif args.role.upper() == "HETER_TRAINER":
+            environs["POD_IP"] = args.heter_trainer_endpoints.split(",")[int(
+                args.current_id)].split(":")[0]
+            environs["PADDLE_PORT"] = args.heter_trainer_endpoints.split(",")[
+                int(args.current_id)].split(":")[1]
+            environs["FLAGS_selected_gpus"] = args.current_id
+
+        for k, v in environs.items():
+            os.environ[k] = str(v)
+
+        self.role = role_maker.PaddleCloudRoleMaker()
+        return self.role
+
+    def build_strategy(self, args):
+        self.strategy = paddle.distributed.fleet.DistributedStrategy()
+        self.strategy.a_sync = True
+
+        return self.strategy
+
+    def build_optimizer(self, avg_cost, strategy):
+        optimizer = fluid.optimizer.SGD(LEARNING_RATE)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+    def run_pserver(self, args):
+        fleet.init_server()
+        fleet.run_server()
+
+    def run_dataset_trainer(self, args):
+        out = self.do_dataset_training(fleet)
+
+    def run_pyreader_trainer(self, args):
+        out = self.do_pyreader_training(fleet)
+
+    def net(self, args, batch_size=4, lr=0.01):
+        raise NotImplementedError(  # abstract hook: child classes build the network
+            "net should be implemented by child classes.")
+
+    def do_dataset_training(self, fleet):
+        raise NotImplementedError(
+            "do_dataset_training should be implemented by child classes.")
+
+    def do_pyreader_training(self, fleet):
+        raise NotImplementedError(
+            "do_pyreader_training should be implemented by child classes.")
+
+
+class TestFleetHeterBase(unittest.TestCase):
+    """
+        start_pserver,start_trainer : add start cmd to test
+        run_cluster : using multi process to test distribute program
+    """
+
+    def _setup_config(self):
+        raise NotImplementedError("tests should have _setup_config implemented")
+
+    def tearDown(self):
+        t = time.time() - self.startTime
+        print('%s: %.3f' % (self.__class__.__name__, t))
+
+    def setUp(self):
+        self.startTime = time.time()
+
+        self._mode = "async"
+        self._reader = "pyreader"
+        self._trainers = 2
+        self._pservers = 2
+        self._port_set = set()
+
+        self._heter_device = "gpu"
+
+        global DIST_UT_PORT
+        if DIST_UT_PORT == 0 and os.getenv("PADDLE_DIST_UT_PORT"):
+            DIST_UT_PORT = int(os.getenv("PADDLE_DIST_UT_PORT"))
+
+        if DIST_UT_PORT:
+            print("set begin_port:", DIST_UT_PORT)
+            self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                DIST_UT_PORT, DIST_UT_PORT + 1)
+            self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                DIST_UT_PORT + 2, DIST_UT_PORT + 3)
+            self._heter_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                DIST_UT_PORT + 4, DIST_UT_PORT + 5)
+            DIST_UT_PORT += 6
+        else:
+            self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                self._find_free_port(), self._find_free_port())
+            self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                self._find_free_port(), self._find_free_port())
+            self._heter_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                self._find_free_port(), self._find_free_port())
+
+        self._python_interp = sys.executable
+        self._geo_sgd_need_push_nums = 5
+        self._grad_clip_mode = 0
+        self._setup_config()
+
+    def _find_free_port(self):
+        def __free_port():
+            with closing(socket.socket(socket.AF_INET,
+                                       socket.SOCK_STREAM)) as s:
+                s.bind(('', 0))
+                return s.getsockname()[1]
+
+        while True:
+            port = __free_port()
+            if port not in self._port_set:
+                self._port_set.add(port)
+                return port
+
+    def _start_pserver(self, cmd, required_envs):
+        ps0_cmd, ps1_cmd = cmd.format(0), cmd.format(1)
+
+        ps0_pipe = open(tempfile.gettempdir() + "/ps0_err.log", "wb+")
+        ps1_pipe = open(tempfile.gettempdir() + "/ps1_err.log", "wb+")
+
+        ps0_proc = subprocess.Popen(
+            ps0_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=ps0_pipe,
+            env=required_envs)
+        ps1_proc = subprocess.Popen(
+            ps1_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=ps1_pipe,
+            env=required_envs)
+        return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
+
+    def _start_trainer(self, cmd, required_envs):
+        tr0_cmd, tr1_cmd = cmd.format(0), cmd.format(1)
+
+        tr0_pipe = open(tempfile.gettempdir() + "/tr0_err.log", "wb+")
+        tr1_pipe = open(tempfile.gettempdir() + "/tr1_err.log", "wb+")
+
+        tr0_out = open(tempfile.gettempdir() + "/tr0_out.log", "wb+")
+        tr1_out = open(tempfile.gettempdir() + "/tr1_out.log", "wb+")
+
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.strip().split(" "),
+            stdout=tr0_out,
+            stderr=tr0_pipe,
+            env=required_envs)
+        tr1_proc = subprocess.Popen(
+            tr1_cmd.strip().split(" "),
+            stdout=tr1_out,
+            stderr=tr1_pipe,
+            env=required_envs)
+
+        return tr0_proc, tr1_proc, tr0_pipe, tr1_pipe
+
+    def _start_heter_trainer(self, cmd, required_envs):
+        heter0_cmd, heter1_cmd = cmd.format(0), cmd.format(1)
+
+        heter0_pipe = open(tempfile.gettempdir() + "/heter0_err.log", "wb+")
+        heter1_pipe = open(tempfile.gettempdir() + "/heter1_err.log", "wb+")
+        heter0_out = open(tempfile.gettempdir() + "/heter0_out.log", "wb+")
+        heter1_out = open(tempfile.gettempdir() + "/heter1_out.log", "wb+")
+
+        heter0_proc = subprocess.Popen(
+            heter0_cmd.strip().split(" "),
+            stdout=heter0_out,
+            stderr=heter0_pipe,
+            env=required_envs)
+        heter1_proc = subprocess.Popen(
+            heter1_cmd.strip().split(" "),
+            stdout=heter1_out,
+            stderr=heter1_pipe,
+            env=required_envs)
+
+        return heter0_proc, heter1_proc, heter0_pipe, heter1_pipe
+
+    def _run_cluster(self, model, envs):
+        env = {'GRAD_CLIP': str(self._grad_clip_mode)}
+        python_path = self._python_interp
+        gloo_path = tempfile.mkdtemp()
+
+        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
+            envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
+            python_path += " -m coverage run --branch -p"
+        env.update(envs)
+
+        tr_cmd = "{0} {1} --role trainer --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8} --heter_trainer_endpoints {9} --heter_trainer_device {10}".format(
+            python_path, model, self._ps_endpoints, self._tr_endpoints,
+            self._trainers, self._mode, self._geo_sgd_need_push_nums,
+            self._reader, gloo_path, self._heter_endpoints, self._heter_device)
+
+        ps_cmd = "{0} {1} --role pserver --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8} --heter_trainer_endpoints {9} --heter_trainer_device {10}".format(
+            python_path, model, self._ps_endpoints, self._tr_endpoints,
+            self._trainers, self._mode, self._geo_sgd_need_push_nums,
+            self._reader, gloo_path, self._heter_endpoints, self._heter_device)
+
+        heter_cmd = "{0} {1} --role heter_trainer --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8} --heter_trainer_endpoints {9} --heter_trainer_device {10}".format(
+            python_path, model, self._ps_endpoints, self._tr_endpoints,
+            self._trainers, self._mode, self._geo_sgd_need_push_nums,
+            self._reader, gloo_path, self._heter_endpoints, self._heter_device)
+
+        # Launch two processes per role: pservers, trainers and heter trainers
+        ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env)
+        tr0, tr1, tr0_pipe, tr1_pipe = self._start_trainer(tr_cmd, env)
+        heter0, heter1, heter0_pipe, heter1_pipe = self._start_heter_trainer(
+            heter_cmd, env)
+
+        # Wait until both trainer processes terminate
+        while True:
+            stat0 = tr0.poll()
+            time.sleep(0.1)
+            if stat0 is not None:
+                break
+
+        while True:
+            stat1 = tr1.poll()
+            time.sleep(0.1)
+            if stat1 is not None:
+                break
+
+        tr0_out, tr0_err = tr0.communicate()
+        tr1_out, tr1_err = tr1.communicate()
+        print("tr end communicate")
+
+        tr0_ret = tr0.returncode
+        tr1_ret = tr1.returncode  # each trainer is checked against its own exit status
+        print("tr get returncode: {}".format(tr0_ret))
+        if tr0_ret != 0:
+            print(
+                "========================Error tr0_err begin==========================="
+            )
+            os.system("cat {}".format(tempfile.gettempdir() + "/tr0_err.log"))
+            print(
+                "========================Error tr0_err end==========================="
+            )
+
+        if tr1_ret != 0:
+            print(
+                "========================Error tr1_err begin==========================="
+            )
+            os.system("cat {}".format(tempfile.gettempdir() + "/tr1_err.log"))
+            print(
+                "========================Error tr1_err end==========================="
+            )
+
+        self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
+        self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
+
+        # close the stderr log files of every role
+        tr0_pipe.close()
+        tr1_pipe.close()
+        ps0_pipe.close()
+        ps1_pipe.close()
+        heter0_pipe.close()
+        heter1_pipe.close()
+
+        ps0.terminate()
+        ps1.terminate()
+        heter0.terminate()
+        heter1.terminate()
+
+        shutil.rmtree(gloo_path)
+        return 0, 0
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
+            "http_proxy": ""
+        }
+
+        required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "3"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
+
+
+def runtime_main(test_class):
+    parser = argparse.ArgumentParser(description='Run Fleet test.')
+    parser.add_argument(
+        '--role',
+        type=str,
+        required=True,
+        choices=['pserver', 'trainer', 'heter_trainer'])
+    parser.add_argument('--endpoints', type=str, required=False, default="")
+    parser.add_argument(
+        '--trainer_endpoints', type=str, required=False, default="")
+    parser.add_argument(
+        '--heter_trainer_endpoints', type=str, required=False, default="")
+    parser.add_argument(
+        '--heter_trainer_device', type=str, required=False, default="gpu")
+    parser.add_argument('--gloo_path', type=str, required=False, default="")
+    parser.add_argument('--current_id', type=int, required=False, default=0)
+    parser.add_argument('--trainers', type=int, required=False, default=1)
+    parser.add_argument('--mode', type=str, required=False, default='async')
+    parser.add_argument(
+        '--geo_sgd_need_push_nums', type=int, required=False, default=2)
+    parser.add_argument('--reader', type=str, required=False, default='dataset')
+    args = parser.parse_args()
+
+    model = test_class()
+    role = model.build_role(args)
+    fleet.init(role)
+    strategy = model.build_strategy(args)
+    avg_cost = model.net(args)
+    model.build_optimizer(avg_cost, strategy)
+    fleet_util._set_strategy(strategy)
+    fleet_util._set_role_maker(role)
+
+    if args.role == "pserver" or args.role == "heter_trainer":
+        model.run_pserver(args)
+    else:
+        if args.reader == "dataset":
+            model.run_dataset_trainer(args)
+        else:
+            model.run_pyreader_trainer(args)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3ffd50dc8da16f4a19c8da5383fe7f763aa7a72
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import unittest
+import tempfile
+from test_dist_fleet_heter_base import TestFleetHeterBase
+
+
+class TestDistHeterDatasetAsync2x2(TestFleetHeterBase):
+    def _setup_config(self):
+        self._mode = "async"
+        self._reader = "dataset"
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
+            "http_proxy": "",
+            "CPU_NUM": "1"
+        }
+
+        required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "4"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
+
+    def test_dist_train(self):
+        self.check_with_place(
+            "dist_fleet_heter_ctr.py", delta=1e-5, check_error_log=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
new file mode 100644
index 0000000000000000000000000000000000000000..3369039661205ef78a3ec0254241c3ed80b771a9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import paddle
+import os
+import math
+import paddle.fluid as fluid
+import paddle.distributed.fleet.base.role_maker as role_maker
+from paddle.distributed.fleet.base.util_factory import fleet_util
+from paddle.distributed.fleet import fleet
+
+
+class TestDistFleetHeterProgram(unittest.TestCase):
+    def build_role(self):
+        environs = {}
+        environs[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36012,127.0.0.1:36013"
+        environs["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36014,127.0.0.1:36015"
+        environs[
+            "PADDLE_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:36016,127.0.0.1:36017"
+        environs["PADDLE_HETER_TRAINER_DEVICE"] = "gpu"
+        environs["TRAINING_ROLE"] = "HETER_TRAINER"
+        environs["PADDLE_TRAINERS_NUM"] = 2
+        environs["PADDLE_TRAINER_ID"] = 0
+        environs["POD_IP"] = "127.0.0.1"
+        environs["PADDLE_PORT"] = "36016"
+        environs["FLAGS_selected_gpus"] = 0
+
+        for k, v in environs.items():
+            os.environ[k] = str(v)
+
+        self.role = role_maker.PaddleCloudRoleMaker()
+        return self.role
+
+    def build_strategy(self):
+        self.strategy = paddle.distributed.fleet.DistributedStrategy()
+        self.strategy.a_sync = True
+        return self.strategy
+
+    def build_input(self):
+        dense_input = fluid.layers.data(
+            name="dense_input", shape=[10], dtype="float32")
+
+        sparse_input_ids = [
+            fluid.layers.data(
+                name="C" + str(i), shape=[1], lod_level=1, dtype="int64")
+            for i in range(1, 27)
+        ]
+
+        label = fluid.layers.data(name="label", shape=[1], dtype="float32")
+
+        inputs = [dense_input] + sparse_input_ids + [label]
+        return inputs
+
+    def build_net(self, inputs):
+        def embedding_layer(input):
+            return fluid.layers.embedding(
+                input=input,
+                is_sparse=True,
+                size=[100001, 10],
+                param_attr=fluid.ParamAttr(
+                    name="SparseFeatFactors",
+                    initializer=fluid.initializer.Uniform()), )
+
+        sparse_embed_seq = list(map(embedding_layer, inputs[1:-1]))
+
+        concated = fluid.layers.concat(sparse_embed_seq + inputs[0:1], axis=1)
+
+        with fluid.device_guard("gpu"):
+            fc1 = fluid.layers.fc(
+                input=concated,
+                size=400,
+                act="relu",
+                param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                    scale=1 / math.sqrt(concated.shape[1]))),
+                name="fc1")
+
+        with fluid.device_guard("cpu"):
+            fc2 = fluid.layers.fc(input=fc1,
+                                  size=400,
+                                  act="relu",
+                                  param_attr=fluid.ParamAttr(
+                                      initializer=fluid.initializer.Normal(
+                                          scale=1 / math.sqrt(fc1.shape[1]))),
+                                  name="fc2")
+
+        with fluid.device_guard("gpu"):
+            fc3 = fluid.layers.fc(input=fc2,
+                                  size=400,
+                                  act="relu",
+                                  param_attr=fluid.ParamAttr(
+                                      initializer=fluid.initializer.Normal(
+                                          scale=1 / math.sqrt(fc2.shape[1]))),
+                                  name="fc3")
+
+        with fluid.device_guard("cpu"):
+            predict = fluid.layers.fc(
+                input=fc3,
+                size=2,
+                act="softmax",
+                param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                    scale=1 / math.sqrt(fc3.shape[1]))), )
+
+        with fluid.device_guard("gpu"):
+            labels = fluid.layers.cast(inputs[-1], dtype="int64")
+            cost = fluid.layers.cross_entropy(input=predict, label=labels)
+            avg_cost = fluid.layers.reduce_sum(cost)
+
+        return avg_cost
+
+    def build_optimizer(self, avg_cost, strategy):
+        optimizer = fluid.optimizer.SGD(1e-2)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+    def test(self):
+        role = self.build_role()
+        fleet.init(role)
+        strategy = self.build_strategy()
+        inputs = self.build_input()
+        avg_cost = self.build_net(inputs)
+        self.build_optimizer(avg_cost, strategy)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py
index 4ccaa3266e087a38ac38667c62487e69c6bb6bf6..533ad9604cf0d879371796fb197e61e931fb479f 100644
--- a/python/paddle/fluid/tests/unittests/test_distribution.py
+++ b/python/paddle/fluid/tests/unittests/test_distribution.py
@@ -105,10 +105,59 @@ class DistributionTest(unittest.TestCase):
             self.gpu_id = 0
         self.executor = fluid.Executor(place)
 
-    def build_normal_common_net(self, batch_size, dims, loc_float, scale_float,
-                                other_loc_float, other_scale_float, scale_np,
-                                other_scale_np, loc_np, other_loc_np, loc,
-                                scale, other_loc, other_scale, values):
+    def build_normal_common_net(self, batch_size, dims, sample_shape, loc_float,
+                                scale_float, other_loc_float, other_scale_float,
+                                scale_np, other_scale_np, loc_np, other_loc_np,
+                                loc, scale, other_loc, other_scale, values):
+        """Generate Normal object and get the output of its methods including
+        ``sample``, ``entropy``, ``log_prob``, ``probs`` and ``kl_divergence``.
+        Parameters ``loc`` and ``scale`` have different data types to test different situations.
+
+        Args:
+          batch_size(int): The first dimension of the shape of parameters(loc and scale).
+          dims(int): The second dimension of the shape of parameters.
+          sample_shape(int): The length of the 1-D shape ``[sample_shape]`` passed to the ``sample`` method.
+          loc_float(float): Generated in function ``get_normal_random_input``, loc is a float number.
+          scale_float(float): Generated in function ``get_normal_random_input``, scale is a float number.
+          other_loc_float(float): Generated in function ``get_normal_random_input``, other_loc is a
+            float number. It is the first parameter in another Normal object used in ``kl_divergence``
+            method.
+          other_scale_float(float): Generated in function ``get_normal_random_input``, other_scale is a
+            float number. It is the second parameter in another Normal object used in ``kl_divergence``
+            method.
+          scale_np(numpy.ndarray): Generated in function ``get_normal_random_input``. A numpy array
+            whose shape is [batch_size, dims].
+          other_scale_np(numpy.ndarray): Generated in function ``get_normal_random_input``. other_scale_np
+            is a numpy array. It is the second parameter of the other Normal object used in the
+            ``kl_divergence`` method.
+          loc_np(numpy.ndarray): Generated in function ``get_normal_random_input``. A numpy array
+            whose shape is [batch_size, dims].
+          other_loc_np(numpy.ndarray): Generated in function ``get_normal_random_input``. other_loc_np
+            is a numpy array. It is the first parameter of the other Normal object used in the
+            ``kl_divergence`` method.
+          loc(Tensor): In dynamic mode, loc is generated in ``build_normal_dygraph`` as a Tensor filled
+            with ``loc_np`` data. In static mode, loc is generated in ``build_normal_static``, where the
+            ``layers.data`` method is used to get a Placeholder whose shape is [dims].
+          scale(Tensor): In dynamic mode, scale is generated in ``build_normal_dygraph`` as a Tensor filled
+            with ``scale_np`` data. In static mode, scale is generated in ``build_normal_static``, where the
+            ``layers.data`` method is used to get a Placeholder whose shape is [dims].
+          other_loc(Tensor): In dynamic mode, other_loc is generated in ``build_normal_dygraph`` as a Tensor
+            filled with ``other_loc_np`` data. In static mode, other_loc is generated in ``build_normal_static``,
+            where the ``layers.data`` method is used to get a Placeholder whose shape is [dims]. It is the
+            first parameter of the other Normal object used in the ``kl_divergence`` method.
+          other_scale(Tensor): In dynamic mode, other_scale is generated in ``build_normal_dygraph`` as a Tensor
+            filled with ``other_scale_np`` data. In static mode, other_scale is generated in ``build_normal_static``,
+            where the ``layers.data`` method is used to get a Placeholder whose shape is [dims]. It is the
+            second parameter of the other Normal object used in the ``kl_divergence`` method.
+          values(Tensor): In dynamic mode, values is generated in ``build_normal_dygraph`` as a Tensor filled with
+            ``values_np`` data. In static mode, values is generated in ``build_normal_static``, where the
+            ``layers.data`` method is used to get a Placeholder whose shape is [dims].
+
+        Returns:
+          List: The outputs of the ``sample``, ``entropy``, ``log_prob``, ``probs`` and ``kl_divergence`` methods.
+          The inputs of these methods can be float, np.ndarray or Tensor, and broadcasting is also considered.
+
+        """
         normal_int = Normal(int(loc_float), int(scale_float))
         normal_float = Normal(loc_float, scale_float)
         other_normal_float = Normal(other_loc_float, other_scale_float)
@@ -130,6 +179,13 @@ class DistributionTest(unittest.TestCase):
         sample_np = normal_np.sample([batch_size, dims])
         sample_variable = normal_variable.sample([batch_size, dims])
 
+        sample_int_diff = normal_int.sample([sample_shape])
+        sample_float_diff = normal_float.sample([sample_shape])
+        sample_float_np_broadcast_diff = normal_float_np_broadcast.sample(
+            [sample_shape])
+        sample_np_diff = normal_np.sample([sample_shape])
+        sample_variable_diff = normal_variable.sample([sample_shape])
+
         entropy_int = normal_int.entropy()
         entropy_float = normal_float.entropy()
         entropy_float_np_broadcast = normal_float_np_broadcast.entropy()
@@ -152,7 +208,9 @@ class DistributionTest(unittest.TestCase):
 
         fetch_list = [
             sample_int, sample_float, sample_float_np_broadcast, sample_np,
-            sample_variable, entropy_int, entropy_float,
+            sample_variable, sample_int_diff, sample_float_diff,
+            sample_float_np_broadcast_diff, sample_np_diff,
+            sample_variable_diff, entropy_int, entropy_float,
             entropy_float_np_broadcast, entropy_np, entropy_variable,
             lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast,
             p_np, p_variable, kl_float, kl_float_np_broadcast, kl_np,
@@ -160,10 +218,22 @@ class DistributionTest(unittest.TestCase):
         ]
         return fetch_list
 
-    def build_normal_static(self, test_program, batch_size, dims, loc_float,
-                            scale_float, other_loc_float, other_scale_float,
-                            scale_np, other_scale_np, loc_np, other_loc_np,
-                            values_np):
+    def build_normal_static(self, test_program, batch_size, dims, sample_shape,
+                            loc_float, scale_float, other_loc_float,
+                            other_scale_float, scale_np, other_scale_np, loc_np,
+                            other_loc_np, values_np):
+        """
+        In static mode, generate feed data of Normal network, and get output fetch_list using
+        ``build_normal_common_net``.
+
+        Args:
+          test_program: In static mode, the Program object.
+          other args can refer to function ``build_normal_common_net``.
+
+        Returns:
+          feed_vars: The feed data of Normal network in static mode.
+          fetch_list: The output is generated by function ``build_normal_common_net``.
+        """
         with fluid.program_guard(test_program):
             loc = layers.data(name='loc', shape=[dims], dtype='float32')
             scale = layers.data(name='scale', shape=[dims], dtype='float32')
@@ -176,9 +246,10 @@ class DistributionTest(unittest.TestCase):
             values = layers.data(name='values', shape=[dims], dtype='float32')
 
             fetch_list = self.build_normal_common_net(
-                batch_size, dims, loc_float, scale_float, other_loc_float,
-                other_scale_float, scale_np, other_scale_np, loc_np,
-                other_loc_np, loc, scale, other_loc, other_scale, values)
+                batch_size, dims, sample_shape, loc_float, scale_float,
+                other_loc_float, other_scale_float, scale_np, other_scale_np,
+                loc_np, other_loc_np, loc, scale, other_loc, other_scale,
+                values)
 
         feed_vars = {
             'loc': loc_np,
@@ -189,9 +260,21 @@ class DistributionTest(unittest.TestCase):
         }
         return feed_vars, fetch_list
 
-    def build_normal_dygraph(self, batch_size, dims, loc_float, scale_float,
-                             other_loc_float, other_scale_float, scale_np,
-                             other_scale_np, loc_np, other_loc_np, values_np):
+    def build_normal_dygraph(self, batch_size, dims, sample_shape, loc_float,
+                             scale_float, other_loc_float, other_scale_float,
+                             scale_np, other_scale_np, loc_np, other_loc_np,
+                             values_np):
+        """
+        In dynamic mode, generate input data of Normal network, and get output fetch_list using
+        ``build_normal_common_net``.
+
+        Args:
+          refer to function ``build_normal_common_net``.
+
+        Returns:
+          fetch_list_numpy: The outputs generated by function ``build_normal_common_net``, with each
+          tensor converted to numpy.ndarray.
+        """
         loc = paddle.to_tensor(loc_np)
         scale = paddle.to_tensor(scale_np)
         other_loc = paddle.to_tensor(other_loc_np)
@@ -199,13 +282,24 @@ class DistributionTest(unittest.TestCase):
         values = paddle.to_tensor(values_np)
 
         fetch_list = self.build_normal_common_net(
-            batch_size, dims, loc_float, scale_float, other_loc_float,
-            other_scale_float, scale_np, other_scale_np, loc_np, other_loc_np,
-            loc, scale, other_loc, other_scale, values)
+            batch_size, dims, sample_shape, loc_float, scale_float,
+            other_loc_float, other_scale_float, scale_np, other_scale_np,
+            loc_np, other_loc_np, loc, scale, other_loc, other_scale, values)
         fetch_list_numpy = [t.numpy() for t in fetch_list]
         return fetch_list_numpy
 
     def get_normal_random_input(self, batch_size, dims):
+        """
+        Generate input data ``loc`` and ``scale`` used in Normal network.
+
+        Args:
+          refer to function ``build_normal_common_net``.
+
+        Returns:
+          List: ``loc`` and ``scale`` in different data types, including float and numpy.ndarray.
+          ``other_loc`` and ``other_scale`` are used in the ``kl_divergence`` method.
+          Refer to ``Args`` in function ``build_normal_common_net``.
+        """
         loc_np = np.random.randn(batch_size, dims).astype('float32')
         other_loc_np = np.random.randn(batch_size, dims).astype('float32')
 
@@ -237,7 +331,20 @@ class DistributionTest(unittest.TestCase):
                                   output_list,
                                   batch_size=2,
                                   dims=3,
+                                  sample_shape=7,
                                   tolerance=1e-6):
+        """
+        Compare the outputs of Normal's methods in paddle and numpy. If the outputs are not consistent,
+        raise errors.
+
+        Args:
+          data_list: Input data generated by function ``get_normal_random_input``.
+          output_list: The outputs of Normal's methods in static or dynamic mode.
+          batch_size(int): The first dimension of the shape of parameters(loc and scale).
+          dims(int): The second dimension of the shape of parameters.
+          sample_shape(int): The length of the 1-D shape ``[sample_shape]`` passed to the ``sample`` method.
+          tolerance(float): The tolerance of the error.
+        """
         loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list
 
         np_normal_int = NormalNumpy(int(loc_float), int(scale_float))
@@ -254,6 +361,13 @@ class DistributionTest(unittest.TestCase):
         gt_sample_float_np_broadcast = np_normal_float_np_broadcast.sample(
             [batch_size, dims])
         gt_sample_np = np_normal.sample([batch_size, dims])
+
+        gt_sample_int_diff = np_normal_int.sample([sample_shape])
+        gt_sample_float_diff = np_normal_float.sample([sample_shape])
+        gt_sample_float_np_broadcast_diff = np_normal_float_np_broadcast.sample(
+            [sample_shape])
+        gt_sample_np_diff = np_normal.sample([sample_shape])
+
         gt_entropy_int = np_normal_int.entropy()
         gt_entropy_float = np_normal_float.entropy()
         gt_entropy_float_np_broadcast = np_normal_float_np_broadcast.entropy()
@@ -271,7 +385,10 @@ class DistributionTest(unittest.TestCase):
         [
             output_sample_int, output_sample_float,
             output_sample_float_np_broadcast, output_sample_np,
-            output_sample_variable, output_entropy_int, output_entropy_float,
+            output_sample_variable, output_sample_int_diff,
+            output_sample_float_diff, output_sample_float_np_broadcast_diff,
+            output_sample_np_diff, output_sample_variable_diff,
+            output_entropy_int, output_entropy_float,
             output_entropy_float_np_broadcast, output_entropy_np,
             output_entropy_variable, output_lp_float_np_broadcast, output_lp_np,
             output_lp_variable, output_p_float_np_broadcast, output_p_np,
@@ -279,31 +396,24 @@ class DistributionTest(unittest.TestCase):
             output_kl_np, output_kl_variable
         ] = output_list
 
-        np.testing.assert_allclose(
-            output_sample_int.shape,
-            gt_sample_int.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_float.shape,
-            gt_sample_float.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_float_np_broadcast.shape,
-            gt_sample_float_np_broadcast.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_np.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_variable.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
+        np.testing.assert_equal(output_sample_int.shape, gt_sample_int.shape)
+        np.testing.assert_equal(output_sample_float.shape,
+                                gt_sample_float.shape)
+        np.testing.assert_equal(output_sample_float_np_broadcast.shape,
+                                gt_sample_float_np_broadcast.shape)
+        np.testing.assert_equal(output_sample_np.shape, gt_sample_np.shape)
+        np.testing.assert_equal(output_sample_variable.shape,
+                                gt_sample_np.shape)
+        np.testing.assert_equal(output_sample_int_diff.shape,
+                                gt_sample_int_diff.shape)
+        np.testing.assert_equal(output_sample_float_diff.shape,
+                                gt_sample_float_diff.shape)
+        np.testing.assert_equal(output_sample_float_np_broadcast_diff.shape,
+                                gt_sample_float_np_broadcast_diff.shape)
+        np.testing.assert_equal(output_sample_np_diff.shape,
+                                gt_sample_np_diff.shape)
+        np.testing.assert_equal(output_sample_variable_diff.shape,
+                                gt_sample_np_diff.shape)
         np.testing.assert_allclose(
             output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance)
         np.testing.assert_allclose(
@@ -353,15 +463,22 @@ class DistributionTest(unittest.TestCase):
     def test_normal_distribution_static(self,
                                         batch_size=2,
                                         dims=3,
+                                        sample_shape=7,
                                         tolerance=1e-6):
+        """
+        Test Normal's methods in static mode.
+
+        Args:
+          refer to ``compare_normal_with_numpy`` function.
+        """
         test_program = fluid.Program()
         data_list = self.get_normal_random_input(batch_size, dims)
         loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list
 
         feed_vars, fetch_list = self.build_normal_static(
-            test_program, batch_size, dims, loc_float, scale_float,
-            other_loc_float, other_scale_float, scale_np, other_scale_np,
-            loc_np, other_loc_np, values_np)
+            test_program, batch_size, dims, sample_shape, loc_float,
+            scale_float, other_loc_float, other_scale_float, scale_np,
+            other_scale_np, loc_np, other_loc_np, values_np)
         self.executor.run(fluid.default_startup_program())
 
         output_list = self.executor.run(program=test_program,
@@ -369,27 +486,62 @@ class DistributionTest(unittest.TestCase):
                                         fetch_list=fetch_list)
 
         self.compare_normal_with_numpy(data_list, output_list, batch_size, dims,
-                                       tolerance)
+                                       sample_shape, tolerance)
 
     def test_normal_distribution_dygraph(self,
                                          batch_size=2,
                                          dims=3,
+                                         sample_shape=7,
                                          tolerance=1e-6):
+        """
+        Test Normal's methods in dynamic mode.
+
+        Args:
+          refer to ``compare_normal_with_numpy`` function.
+        """
         paddle.disable_static()
         data_list = self.get_normal_random_input(batch_size, dims)
         loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list
 
         output_list = self.build_normal_dygraph(
-            batch_size, dims, loc_float, scale_float, other_loc_float,
-            other_scale_float, scale_np, other_scale_np, loc_np, other_loc_np,
-            values_np)
+            batch_size, dims, sample_shape, loc_float, scale_float,
+            other_loc_float, other_scale_float, scale_np, other_scale_np,
+            loc_np, other_loc_np, values_np)
 
         self.compare_normal_with_numpy(data_list, output_list, batch_size, dims,
-                                       tolerance)
+                                       sample_shape, tolerance)
         paddle.enable_static()
 
-    def build_uniform_common_net(self, batch_size, dims, low_float, high_float,
-                                 high_np, low_np, values_np, low, high, values):
+    def build_uniform_common_net(self, batch_size, dims, sample_shape,
+                                 low_float, high_float, high_np, low_np,
+                                 values_np, low, high, values):
+        """Generate Uniform object and get the output of its methods including ``sample``, ``entropy``,
+         ``log_prob`` and ``probs``.
+        Parameters ``low`` and ``high`` have different data types to test different situations.
+
+        Args:
+          batch_size(int): The first dimension of the shape of parameters(low and high).
+          dims(int): The second dimension of the shape of parameters.
+          sample_shape(int): The length of the 1-D shape ``[sample_shape]`` passed to the ``sample`` method.
+          low_float(float): Parameter ``low`` is a float number.
+          high_float(float): Parameter ``high`` is a float number.
+          high_np(numpy.ndarray): A numpy array whose shape is [batch_size, dims].
+          low_np(numpy.ndarray): A numpy array whose shape is [batch_size, dims].
+          values_np(numpy.ndarray): The input of ``log_prob`` and ``probs`` methods. A numpy array whose
+            shape is [batch_size, dims].
+          low(Tensor): In dynamic mode, low is generated in ``build_uniform_dygraph`` as a Tensor filled
+            with ``low_np`` data. In static mode, low is generated in ``build_uniform_static``.
+          high(Tensor): In dynamic mode, high is generated in ``build_uniform_dygraph`` as a Tensor filled
+            with ``high_np`` data. In static mode, high is generated in ``build_uniform_static``.
+          values(Tensor): In dynamic mode, values is generated in ``build_uniform_dygraph`` as a Tensor
+            filled with ``values_np`` data. In static mode, values is generated in ``build_uniform_static``.
+
+        Returns:
+          List: The outputs of the ``sample``, ``entropy``, ``log_prob`` and ``probs`` methods.
+          The inputs of these methods can be float, np.ndarray or Tensor, and broadcasting is
+          also considered.
+
+        """
         uniform_int = Uniform(int(low_float), int(high_float))
         uniform_float = Uniform(low_float, high_float)
         uniform_float_np_broadcast = Uniform(low_float, high_np)
@@ -403,6 +555,13 @@ class DistributionTest(unittest.TestCase):
         sample_np = uniform_np.sample([batch_size, dims])
         sample_variable = uniform_variable.sample([batch_size, dims])
 
+        sample_int_diff = uniform_int.sample([sample_shape])
+        sample_float_diff = uniform_float.sample([sample_shape])
+        sample_float_np_broadcast_diff = uniform_float_np_broadcast.sample(
+            [sample_shape])
+        sample_np_diff = uniform_np.sample([sample_shape])
+        sample_variable_diff = uniform_variable.sample([sample_shape])
+
         entropy_int = uniform_int.entropy()
         entropy_float = uniform_float.entropy()
         entropy_float_np_broadcast = uniform_float_np_broadcast.entropy()
@@ -419,15 +578,29 @@ class DistributionTest(unittest.TestCase):
 
         fetch_list = [
             sample_int, sample_float, sample_float_np_broadcast, sample_np,
-            sample_variable, entropy_int, entropy_float,
+            sample_variable, sample_int_diff, sample_float_diff,
+            sample_float_np_broadcast_diff, sample_np_diff,
+            sample_variable_diff, entropy_int, entropy_float,
             entropy_float_np_broadcast, entropy_np, entropy_variable,
             lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast,
             p_np, p_variable
         ]
         return fetch_list
 
-    def build_uniform_static(self, test_program, batch_size, dims, low_float,
-                             high_float, high_np, low_np, values_np):
+    def build_uniform_static(self, test_program, batch_size, dims, sample_shape,
+                             low_float, high_float, high_np, low_np, values_np):
+        """
+        In static mode, generate feed data of Uniform network, and get output fetch_list using
+        ``build_uniform_common_net``.
+
+        Args:
+          test_program: In static mode, the Program object.
+          other args can refer to function ``build_uniform_common_net``.
+
+        Returns:
+          feed_vars: The feed data of Uniform network in static mode.
+          fetch_list: The output is generated by function ``build_uniform_common_net``.
+        """
         with fluid.program_guard(test_program):
             low = layers.data(name='low', shape=[dims], dtype='float32')
             high = layers.data(name='high', shape=[dims], dtype='float32')
@@ -435,21 +608,32 @@ class DistributionTest(unittest.TestCase):
             values = layers.data(name='values', shape=[dims], dtype='float32')
 
             fetch_list = self.build_uniform_common_net(
-                batch_size, dims, low_float, high_float, high_np, low_np,
-                values_np, low, high, values)
+                batch_size, dims, sample_shape, low_float, high_float, high_np,
+                low_np, values_np, low, high, values)
 
         feed_vars = {'low': low_np, 'high': high_np, 'values': values_np}
         return feed_vars, fetch_list
 
-    def build_uniform_dygraph(self, batch_size, dims, low_float, high_float,
-                              high_np, low_np, values_np):
+    def build_uniform_dygraph(self, batch_size, dims, sample_shape, low_float,
+                              high_float, high_np, low_np, values_np):
+        """
+        In dynamic mode, generate input data of Uniform network, and get output fetch_list using
+        ``build_uniform_common_net``.
+
+        Args:
+          refer to function ``build_uniform_common_net``.
+
+        Returns:
+          fetch_list_numpy: The outputs generated by function ``build_uniform_common_net``, with each
+          tensor converted to numpy.ndarray.
+        """
         low = paddle.to_tensor(low_np)
         high = paddle.to_tensor(high_np)
         values = paddle.to_tensor(values_np)
 
-        fetch_list = self.build_uniform_common_net(batch_size, dims, low_float,
-                                                   high_float, high_np, low_np,
-                                                   values_np, low, high, values)
+        fetch_list = self.build_uniform_common_net(
+            batch_size, dims, sample_shape, low_float, high_float, high_np,
+            low_np, values_np, low, high, values)
         fetch_list_numpy = [t.numpy() for t in fetch_list]
         return fetch_list_numpy
 
@@ -458,7 +642,20 @@ class DistributionTest(unittest.TestCase):
                                    output_list,
                                    batch_size=2,
                                    dims=3,
+                                   sample_shape=7,
                                    tolerance=1e-6):
+        """
+        Compare the outputs of Uniform's methods in paddle and numpy. If the outputs are not consistent,
+        raise errors.
+
+        Args:
+          data_list: Input data including float and numpy.ndarray type of ``low`` and ``high`` parameters.
+          output_list: The outputs of Uniform's methods in static or dynamic mode.
+          batch_size(int): The first dimension of the shape of parameters(low and high).
+          dims(int): The second dimension of the shape of parameters.
+          sample_shape(int): The length of the 1-D shape ``[sample_shape]`` passed to the ``sample`` method.
+          tolerance(float): The tolerance of the error.
+        """
         [low_np, low_float, high_float, high_np, values_np] = data_list
 
         np_uniform_int = UniformNumpy(int(low_float), int(high_float))
@@ -471,6 +668,11 @@ class DistributionTest(unittest.TestCase):
         gt_sample_float_np_broadcast = np_uniform_float_np_broadcast.sample(
             [batch_size, dims])
         gt_sample_np = np_uniform.sample([batch_size, dims])
+        gt_sample_int_diff = np_uniform_int.sample([sample_shape])
+        gt_sample_float_diff = np_uniform_float.sample([sample_shape])
+        gt_sample_float_np_broadcast_diff = np_uniform_float_np_broadcast.sample(
+            [sample_shape])
+        gt_sample_np_diff = np_uniform.sample([sample_shape])
         gt_entropy_int = np_uniform_int.entropy()
         gt_entropy_float = np_uniform_float.entropy()
         gt_entropy_float_np_broadcast = np_uniform_float_np_broadcast.entropy()
@@ -484,38 +686,34 @@ class DistributionTest(unittest.TestCase):
         [
             output_sample_int, output_sample_float,
             output_sample_float_np_broadcast, output_sample_np,
-            output_sample_variable, output_entropy_int, output_entropy_float,
+            output_sample_variable, output_sample_int_diff,
+            output_sample_float_diff, output_sample_float_np_broadcast_diff,
+            output_sample_np_diff, output_sample_variable_diff,
+            output_entropy_int, output_entropy_float,
             output_entropy_float_np_broadcast, output_entropy_np,
             output_entropy_variable, output_lp_float_np_broadcast, output_lp_np,
             output_lp_variable, output_p_float_np_broadcast, output_p_np,
             output_p_variable
         ] = output_list
 
-        np.testing.assert_allclose(
-            output_sample_int.shape,
-            gt_sample_int.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_float.shape,
-            gt_sample_float.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_float_np_broadcast.shape,
-            gt_sample_float_np_broadcast.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_np.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_variable.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
+        np.testing.assert_equal(output_sample_int.shape, gt_sample_int.shape)
+        np.testing.assert_equal(output_sample_float.shape,
+                                gt_sample_float.shape)
+        np.testing.assert_equal(output_sample_float_np_broadcast.shape,
+                                gt_sample_float_np_broadcast.shape)
+        np.testing.assert_equal(output_sample_np.shape, gt_sample_np.shape)
+        np.testing.assert_equal(output_sample_variable.shape,
+                                gt_sample_np.shape)
+        np.testing.assert_equal(output_sample_int_diff.shape,
+                                gt_sample_int_diff.shape)
+        np.testing.assert_equal(output_sample_float_diff.shape,
+                                gt_sample_float_diff.shape)
+        np.testing.assert_equal(output_sample_float_np_broadcast_diff.shape,
+                                gt_sample_float_np_broadcast_diff.shape)
+        np.testing.assert_equal(output_sample_np_diff.shape,
+                                gt_sample_np_diff.shape)
+        np.testing.assert_equal(output_sample_variable_diff.shape,
+                                gt_sample_np_diff.shape)
         np.testing.assert_allclose(
             output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance)
         np.testing.assert_allclose(
@@ -554,7 +752,14 @@ class DistributionTest(unittest.TestCase):
     def test_uniform_distribution_static(self,
                                          batch_size=2,
                                          dims=3,
+                                         sample_shape=7,
                                          tolerance=1e-6):
+        """
+        Test Uniform's methods in static mode.
+
+        Args:
+          refer to ``compare_uniform_with_numpy`` function.
+        """
         test_program = fluid.Program()
 
         low_np = np.random.randn(batch_size, dims).astype('float32')
@@ -567,8 +772,8 @@ class DistributionTest(unittest.TestCase):
         data_list = [low_np, low_float, high_float, high_np, values_np]
 
         feed_vars, fetch_list = self.build_uniform_static(
-            test_program, batch_size, dims, low_float, high_float, high_np,
-            low_np, values_np)
+            test_program, batch_size, dims, sample_shape, low_float, high_float,
+            high_np, low_np, values_np)
 
         self.executor.run(fluid.default_startup_program())
 
@@ -577,12 +782,19 @@ class DistributionTest(unittest.TestCase):
                                         feed=feed_vars,
                                         fetch_list=fetch_list)
         self.compare_uniform_with_numpy(data_list, output_list, batch_size,
-                                        dims, tolerance)
+                                        dims, sample_shape, tolerance)
 
     def test_uniform_distribution_dygraph(self,
                                           batch_size=2,
                                           dims=3,
+                                          sample_shape=7,
                                           tolerance=1e-6):
+        """
+        Test Uniform's methods in dynamic mode.
+
+        Args:
+          refer to ``compare_uniform_with_numpy`` function.
+        """
         paddle.disable_static()
 
         low_np = np.random.randn(batch_size, dims).astype('float32')
@@ -593,11 +805,12 @@ class DistributionTest(unittest.TestCase):
         values_np = np.random.randn(batch_size, dims).astype('float32')
 
         data_list = [low_np, low_float, high_float, high_np, values_np]
-        output_list = self.build_uniform_dygraph(
-            batch_size, dims, low_float, high_float, high_np, low_np, values_np)
+        output_list = self.build_uniform_dygraph(batch_size, dims, sample_shape,
+                                                 low_float, high_float, high_np,
+                                                 low_np, values_np)
 
         self.compare_uniform_with_numpy(data_list, output_list, batch_size,
-                                        dims, tolerance)
+                                        dims, sample_shape, tolerance)
         paddle.enable_static()
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
index ae4355ec412c87ff1d947338d4f15256ace539b8..88b496c1d89e63752f93bd2abb77c61af7e86c4d 100644
--- a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
@@ -27,6 +27,8 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
 from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
 
+SEED = 123123111
+
 
 class SimpleImgConvPool(fluid.dygraph.Layer):
     def __init__(self,
@@ -105,12 +107,11 @@ class MNIST(fluid.dygraph.Layer):
 
 class TestDygraphMultiForward(unittest.TestCase):
     def test_mnist_forward_float32(self):
-        seed = 90
         epoch_num = 1
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
 
+        with fluid.dygraph.guard():
+            paddle.manual_seed(SEED)
+            paddle.framework.random._manual_program_seed(SEED)
             mnist = MNIST()
             sgd = SGDOptimizer(
                 learning_rate=1e-3, parameter_list=mnist.parameters())
@@ -142,9 +143,8 @@ class TestDygraphMultiForward(unittest.TestCase):
                             dy_param_init_value[param.name] = param.numpy()
 
         with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
+            paddle.manual_seed(SEED)
+            paddle.framework.random._manual_program_seed(SEED)
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
 
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
index 6c0bb97bf6f14b2cef5050a99c02c7843b86be92..e0c0277270b406bf745a5293e9c1bcbe158e52d0 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
@@ -18,6 +18,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.layers as layers
@@ -465,9 +466,9 @@ class PaddingRNNTestBase(unittest.TestCase):
         pass
 
     def _prepare_program(self, config, parallel=True):
+        paddle.manual_seed(config.random_seed)
         self.main_program = fluid.Program()
         self.startup_program = fluid.Program()
-        self.startup_program.random_seed = config.random_seed
         with fluid.program_guard(self.main_program, self.startup_program):
             with fluid.unique_name.guard():
                 res_vars = lm_model(
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
index 4fe085ce854726676bc1b1bef650419b3ebbfc86..0b6acc7615395ed99a484e0e56f9c62447a1f345 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
@@ -193,6 +193,27 @@ class TestFloorDivideAPI(unittest.TestCase):
                 z_expected = np.array([2., 0., 2.])
                 self.assertEqual((z_expected == z.numpy()).all(), True)
 
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            # divide by zero 
+            np_x = np.array([2, 3, 4])
+            np_y = np.array([0])
+            x = paddle.to_tensor(np_x)
+            y = paddle.to_tensor(np_y)
+            try:
+                z = x // y
+            except Exception as e:
+                print("Error: Divide by zero encounter in floor_divide\n")
+
+            # divide by zero 
+            np_x = np.array([2])
+            np_y = np.array([0, 0, 0])
+            x = paddle.to_tensor(np_x, dtype="int32")
+            y = paddle.to_tensor(np_y, dtype="int32")
+            try:
+                z = x // y
+            except Exception as e:
+                print("Error: Divide by zero encounter in floor_divide\n")
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
index 25769a42aa261c0b5ae9fe2795a337c668580a99..f5d8b4f704da8acd97475444346522f63d3724fd 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
@@ -204,6 +204,22 @@ class TestRemainderAPI(unittest.TestCase):
                 z_expected = np.array([1., 0., 1., 1., 0., 1.])
                 self.assertEqual((z_expected == z.numpy()).all(), True)
 
+                np_x = np.array([-3.3, 11.5, -2, 3.5])
+                np_y = np.array([-1.2, 2., 3.3, -2.3])
+                x = paddle.to_tensor(np_x)
+                y = paddle.to_tensor(np_y)
+                z = x % y
+                z_expected = np.array([-0.9, 1.5, 1.3, -1.1])
+                self.assertEqual(np.allclose(z_expected, z.numpy()), True)
+
+                np_x = np.array([-3, 11, -2, 3])
+                np_y = np.array([-1, 2, 3, -2])
+                x = paddle.to_tensor(np_x, dtype="int64")
+                y = paddle.to_tensor(np_y, dtype="int64")
+                z = x % y
+                z_expected = np.array([0, 1, 1, -1])
+                self.assertEqual(np.allclose(z_expected, z.numpy()), True)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py
index 5a562dc14650a74ee6f76fa3d8c5f207da6475d6..c18b7c5b044e76fdfbb53b68633cfaf222190a38 100644
--- a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 import six
 import unittest
@@ -37,13 +38,13 @@ class TestEmbeddingIdStopGradientBase(unittest.TestCase):
             self.assertTrue(np.array_equal(grad_value1, grad_value2))
 
     def run_program(self, place, stop_gradient=False):
+        np.random.seed(1)
+        paddle.manual_seed(1)
+        paddle.framework.random._manual_program_seed(1)
+
         startup_program = fluid.Program()
         main_program = fluid.Program()
 
-        np.random.seed(1)
-        startup_program.random_seed = 1
-        main_program.random_seed = 1
-
         scope = fluid.Scope()
         with fluid.program_guard(main_program, startup_program):
             with fluid.scope_guard(scope):
diff --git a/python/paddle/fluid/tests/unittests/test_expand_as_op.py b/python/paddle/fluid/tests/unittests/test_expand_as_op.py
index 69ed9f141437c307dc9e43fb501000d5cafeeaf7..150aff78508c61031a97bb56c9f14c4485cecea1 100755
--- a/python/paddle/fluid/tests/unittests/test_expand_as_op.py
+++ b/python/paddle/fluid/tests/unittests/test_expand_as_op.py
@@ -102,8 +102,23 @@ class TestExpandAsOpRank4(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestExpandAsDygraphAPI(unittest.TestCase):
+    def test_api(self):
+        """Dygraph expand_as must tile x to the target tensor's shape."""
+        import paddle
+        np_data_x = np.array([1, 2, 3]).astype('int32')
+        np_data_y = np.array([1, 2, 3, 1, 2, 3]).astype('int32')
+        paddle.disable_static()
+        try:
+            out = fluid.layers.expand_as(
+                paddle.to_tensor(np_data_x), paddle.to_tensor(np_data_y))
+            self.assertTrue(np.array_equal(out.numpy(), np.tile(np_data_x, 2)))
+        finally:  # switch back to static mode even when the check fails
+            paddle.enable_static()
+
+
 # Test python API
-class TestExpandAPI(unittest.TestCase):
+class TestExpandAsAPI(unittest.TestCase):
     def test_api(self):
         input1 = np.random.random([12, 14]).astype("float32")
         input2 = np.random.random([48, 14]).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py
index e5a7e6c702aec114968adb7fb23309c9d944559d..ec30cb70c579092b1ee03b9debc9a26dcb19926e 100644
--- a/python/paddle/fluid/tests/unittests/test_fc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fc_op.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import unittest
+import paddle
 import numpy as np
 from op_test import OpTest
 import paddle.fluid as fluid
@@ -135,31 +136,32 @@ class TestFCOpWithPadding(TestFCOp):
 
 class TestFcOp_NumFlattenDims_NegOne(unittest.TestCase):
     def test_api(self):
-        startup_program = Program()
-        main_program = Program()
-        startup_program.random_seed = SEED
-        main_program.random_seed = SEED
-
-        with program_guard(main_program, startup_program):
-            input = np.random.random([2, 2, 25]).astype("float32")
-            x = fluid.layers.data(
-                name="x",
-                shape=[2, 2, 25],
-                append_batch_size=False,
-                dtype="float32")
-
-            out_1 = fluid.layers.fc(input=x, size=1, num_flatten_dims=-1)
-            out_2 = fluid.layers.fc(input=x, size=1, num_flatten_dims=2)
-
-        place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
-        exe = fluid.Executor(place=place)
-        exe.run(startup_program)
-        res_1, res_2 = exe.run(main_program,
-                               feed={"x": input},
-                               fetch_list=[out_1, out_2])
-
-        assert np.array_equal(res_1, res_2)
+        def run_program(num_flatten_dims):
+            # Re-seed paddle and numpy so repeated calls are reproducible.
+            paddle.manual_seed(SEED)
+            np.random.seed(SEED)
+            startup_program = Program()
+            main_program = Program()
+            with program_guard(main_program, startup_program):
+                input = np.random.random([2, 2, 25]).astype("float32")
+                x = fluid.layers.data(
+                    name="x",
+                    shape=[2, 2, 25],
+                    append_batch_size=False,
+                    dtype="float32")
+                out = fluid.layers.fc(input=x,
+                                      size=1,
+                                      num_flatten_dims=num_flatten_dims)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            exe = fluid.Executor(place=place)
+            exe.run(startup_program)
+            return exe.run(main_program, feed={"x": input}, fetch_list=[out])
+
+        res_1 = run_program(-1)
+        res_2 = run_program(2)
+        self.assertTrue(np.array_equal(res_1, res_2))
 
 
 class TestFCOpError(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py
index f5e888ab0eb3ca597bf62245ff9f3024fe81ee95..25801793f1f2e70c404727ed4f64c7d3c830aec9 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py
@@ -43,7 +43,7 @@ class TestFleetBase(unittest.TestCase):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         strategy = fleet.DistributedStrategy()
-        optimizer = paddle.optimizer.SGD(learning_rate=0.001)
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
index 2ac1dfc0303defaadd48d5d7457643405a4f09a0..9eec73116cc283b58d3ee39cefb9256e12d4ef15 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
@@ -15,25 +15,37 @@
 import unittest
 import paddle
 import os
-from launch_function_helper import launch_func, _find_free_port
+from launch_function_helper import launch_func, wait, _find_free_port
 
 
 class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        try:
+            self._dist_ut_port_0 = int(os.environ["PADDLE_DIST_UT_PORT"])
+            self._dist_ut_port_1 = self._dist_ut_port_0 + 1
+        except (KeyError, ValueError):  # unset or non-numeric: probe instead
+            self._dist_ut_port_0 = _find_free_port(set())
+            self._dist_ut_port_1 = _find_free_port({self._dist_ut_port_0})
+
     def test_graph_execution_optimizer_not_apply(self):
+        port_a = self._dist_ut_port_0
+        port_b = self._dist_ut_port_1
         node_a = {
             "PADDLE_TRAINER_ID": "0",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36003",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a),
             "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
             "http_proxy": "",
             "https_proxy": ""
         }
 
         node_b = {
             "PADDLE_TRAINER_ID": "1",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36004",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b),
             "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
             "http_proxy": "",
             "https_proxy": ""
         }
@@ -65,14 +77,11 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
         proc_a.start()
         proc_b = launch_func(node_func, node_b)
         proc_b.start()
-        proc_a.join()
-        proc_b.join()
+        wait([proc_a, proc_b])
 
     def test_graph_execution_optimizer(self):
-
-        port_set = set()
-        port_a = _find_free_port(port_set)
-        port_b = _find_free_port(port_set)
+        port_a = self._dist_ut_port_0 + 2
+        port_b = self._dist_ut_port_1 + 2
 
         node_a = {
             "PADDLE_TRAINER_ID": "0",
@@ -138,24 +147,27 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
         proc_a.start()
         proc_b = launch_func(node_func, node_b)
         proc_b.start()
-        proc_a.join()
-        proc_b.join()
+        wait([proc_a, proc_b])
 
     def test_graph_execution_optimizer_not_apply_v2(self):
+        port_a = self._dist_ut_port_0 + 4
+        port_b = self._dist_ut_port_1 + 4
         node_a = {
             "PADDLE_TRAINER_ID": "0",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36003",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a),
             "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
             "http_proxy": "",
             "https_proxy": ""
         }
 
         node_b = {
             "PADDLE_TRAINER_ID": "1",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36004",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b),
             "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
             "http_proxy": "",
             "https_proxy": ""
         }
@@ -187,24 +199,27 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
         proc_a.start()
         proc_b = launch_func(node_func, node_b)
         proc_b.start()
-        proc_a.join()
-        proc_b.join()
+        wait([proc_a, proc_b])
 
     def test_graph_execution_optimizer(self):
+        port_a = self._dist_ut_port_0 + 6
+        port_b = self._dist_ut_port_1 + 6
         node_a = {
             "PADDLE_TRAINER_ID": "0",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36001",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a),
             "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
             "http_proxy": "",
             "https_proxy": ""
         }
 
         node_b = {
             "PADDLE_TRAINER_ID": "1",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36002",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b),
             "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
             "http_proxy": "",
             "https_proxy": ""
         }
@@ -253,8 +268,7 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
         proc_a.start()
         proc_b = launch_func(node_func, node_b)
         proc_b.start()
-        proc_a.join()
-        proc_b.join()
+        wait([proc_a, proc_b])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
index 06f8da84a28d22127fad122d39d4e3903fdf25bf..47671ab3a85e8596d5b677f5e1cf9f6ebecaf155 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net
 from fake_reader import fake_imdb_reader
 from parallel_executor_test_base import TestParallelExecutorBase
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
index 62eef67a5695f62c3594824c24886e23a5a59801..921dbdbc6d4e1b169c2c8aa199ea15f886bd0128 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
@@ -19,8 +19,6 @@ import unittest
 
 class TestFuseBatchNormActPass(unittest.TestCase):
     def build_program(self, main_program, startup_program, use_cuda, seed=1):
-        main_program.random_seed = seed
-        startup_program.random_seed = seed
         with fluid.program_guard(main_program, startup_program):
             x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
             y = fluid.layers.data(name="y", shape=[1], dtype='int64')
@@ -59,6 +57,8 @@ class TestFuseBatchNormActPass(unittest.TestCase):
         return x, y, loss
 
     def check(self, place, use_cuda):
+        paddle.manual_seed(1)
+        paddle.framework.random._manual_program_seed(1)
         main_program = fluid.Program()
         startup_program = fluid.Program()
         x, y, loss = self.build_program(main_program, startup_program, use_cuda)
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
index b47bcd2a032a32f30b2bcdd2b48541c660abdab2..a22daeedd09e9a1da3a17773fed43d35ece51bec 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net
 from fake_reader import fake_imdb_reader
 from parallel_executor_test_base import TestParallelExecutorBase
diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
index 6b08c4250f61c9680a13b21f1c6c2e940c60ca75..9ab84404073906a8a95f9eb562cbe220e7c6b455 100644
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
-
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
@@ -37,6 +37,7 @@ class TestGaussianRandomOp(OpTest):
             "seed": 10,
             "use_mkldnn": self.use_mkldnn
         }
+        paddle.manual_seed(10)
 
         self.outputs = {'Out': np.zeros((123, 92), dtype='float32')}
 
diff --git a/python/paddle/fluid/tests/unittests/test_generator.py b/python/paddle/fluid/tests/unittests/test_generator.py
index 6cc43d3d5498284e8a24dd272eaed08cdf830733..8b1f420358d3187bd4746431fefe449a4d6ed2ec 100644
--- a/python/paddle/fluid/tests/unittests/test_generator.py
+++ b/python/paddle/fluid/tests/unittests/test_generator.py
@@ -16,6 +16,7 @@
 from __future__ import print_function
 import os
 import unittest
+import paddle
 import paddle.fluid.generator as generator
 import time  # temp for debug
 
@@ -34,10 +35,11 @@ class TestGenerator(unittest.TestCase):
         st = gen.get_state()
         gen.set_state(st)
         gen.random()
-        gen.set_cpu_engine(gen.get_cpu_engine())
 
     def test_basic_generator_error(self):
-        self.assertRaises(ValueError, generator.Generator, device="CUDA")
+        if paddle.fluid.core.is_compiled_with_cuda():
+            self.assertRaises(
+                ValueError, generator.Generator, place=paddle.CUDAPlace(0))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
index 4f0beb8c0dcd5384e7b9f6e30e8082595ac4dc06..7c1ff41f7e7674936f2725dae3ea19f399cf51e4 100644
--- a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
+++ b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
@@ -35,10 +35,10 @@ def random_reader():
 
 
 def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
+    paddle.manual_seed(1)
+    paddle.framework.random._manual_program_seed(1)
     startup_prog = fluid.Program()
     main_prog = fluid.Program()
-    startup_prog.random_seed = 1
-    main_prog.random_seed = 1
 
     with fluid.unique_name.guard():
         with fluid.program_guard(main_prog, startup_prog):
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
index 5777bb3c6f5e34f035c32ed963906b5ccc03ba85..5c9867e681524f519e267fb744fc4090c836036a 100644
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
@@ -266,8 +267,8 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
 
     def training_test(self, is_sparse):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
+            paddle.manual_seed(1)
             start_up = fluid.default_startup_program()
-            start_up.random_seed = 1  # Fix random seed
             x = np.arange(6).reshape(6)
             path_table = np.array([(1, 2, -1), (1, 2, -1)]).astype('int64')
             path_code = np.array([(1, 0, -1), (0, 0, -1)]).astype('int64')
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
index a4f3858d6fb242b8689bd1d300861faf8ed73e54..fdf7adbfb45f0a6133909f4a7d6b488cae09144c 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
@@ -121,6 +121,7 @@ class TestAmpScaler(unittest.TestCase):
 
         def run_simple_conv(inp_np, use_scaler=True):
             paddle.manual_seed(10)
+            paddle.framework.random._manual_program_seed(10)
             with fluid.dygraph.guard():
                 model = SimpleConv(
                     num_channels=3,
@@ -204,6 +205,7 @@ class TestResnet(unittest.TestCase):
 
         with fluid.dygraph.guard():
             paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             resnet = ResNet(use_cudnn=True)
             optimizer = optimizer_setting(
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
index 2a25bf6f8abade11d9ad25894753f6d17066e7fd..837e82882e9df8f50ca83a5df20ddf0f03ee504b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
@@ -238,8 +238,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
             out2 = linear2(b)
             out1.stop_gradient = True
             out = fluid.layers.concat(input=[out1, out2, c], axis=1)
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            out.backward(backward_strategy)
+            out.backward()
             self.assertTrue(linear.weight.gradient() is None)
             self.assertTrue(out1.gradient() is None)
 
@@ -311,9 +310,8 @@ class TestImperativeAutoPrune(unittest.TestCase):
             out2 = linear2(b)
             out1.stop_gradient = True
             out = fluid.layers.concat(input=[out1, out2, c], axis=1)
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            out.backward(backward_strategy)
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            out.backward()
             self.assertTrue(linear.weight.gradient() is None)
             self.assertTrue(out1.gradient() is None)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
index f83f8ef35215e5a0199c4d63744882126212b928..b74182d27ab8c89cc43d3fc1656ca13916d159c1 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
@@ -314,9 +314,8 @@ class TestImperative(unittest.TestCase):
                 inputs2.append(tmp)
             ret2 = fluid.layers.sums(inputs2)
             loss2 = fluid.layers.reduce_sum(ret2)
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            loss2.backward(backward_strategy)
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            loss2.backward()
 
             self.assertTrue(np.allclose(ret.numpy(), x * 10))
             self.assertTrue(np.allclose(inputs[0].gradient(), x))
@@ -403,9 +402,8 @@ class TestImperative(unittest.TestCase):
             x2 = l2(var_inp2)[0]
             self.assertIsNotNone(x2)
             dy_out2 = x2.numpy()
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            x2.backward(backward_strategy)
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            x2.backward()
             dy_grad2 = l2._x_for_debug.gradient()
 
         with new_program_scope():
@@ -442,9 +440,8 @@ class TestImperative(unittest.TestCase):
             mlp2 = MLP(input_size=2)
             out2 = mlp2(var_inp2)
             dy_out2 = out2.numpy()
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            out2.backward(backward_strategy)
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            out2.backward()
             dy_grad2 = mlp2._linear1.weight.gradient()
 
         with new_program_scope():
@@ -552,9 +549,8 @@ class TestImperative(unittest.TestCase):
             simple_rnn2 = SimpleRNN()
             outs2, pre_hiddens2 = simple_rnn2.forward(var_inp2)
             dy_out2 = outs2[3].numpy()
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            outs2[3].backward(backward_strategy)
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            outs2[3].backward()
             dy_grad_h2o2 = simple_rnn2._cell._h2o_w.gradient()
             dy_grad_h2h2 = simple_rnn2._cell._h2h_w.gradient()
             dy_grad_i2h2 = simple_rnn2._cell._i2h_w.gradient()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py
index d3f488d92ac455072b37274e2ce782bcf41e8cc7..428f97c0af8182efdaab31dbd720e523578f2292 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py
@@ -43,7 +43,7 @@ class MLP(fluid.Layer):
 class TestDataParallelStateDict(unittest.TestCase):
     def test_data_parallel_state_dict(self):
         with fluid.dygraph.guard():
-            strategy = paddle.prepare_context()
+            strategy = paddle.distributed.prepare_context()
             mlp = MLP()
             parallel_mlp = dygraph.parallel.DataParallel(mlp, strategy)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
index f76c3bd958081070939a85c390eeaeaa389ad5a4..cc6c2f97a9334bf0c3932ecc1fcc18b0b56b2797 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
@@ -206,11 +206,10 @@ class TestDygraphDeepCF(unittest.TestCase):
         else:
             (users_np, items_np, labels_np, num_users, num_items,
              matrix) = get_data()
-
+        paddle.manual_seed(seed)
+        paddle.framework.random._manual_program_seed(seed)
         startup = fluid.Program()
-        startup.random_seed = seed
         main = fluid.Program()
-        main.random_seed = seed
 
         scope = fluid.core.Scope()
         with new_program_scope(main=main, startup=startup, scope=scope):
@@ -244,8 +243,8 @@ class TestDygraphDeepCF(unittest.TestCase):
                     sys.stderr.write('static loss %s\n' % static_loss)
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             deepcf = DeepCF(num_users, num_items, matrix)
             adam = fluid.optimizer.AdamOptimizer(
@@ -269,14 +268,13 @@ class TestDygraphDeepCF(unittest.TestCase):
                     sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss))
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             deepcf2 = DeepCF(num_users, num_items, matrix)
             adam2 = fluid.optimizer.AdamOptimizer(
                 0.01, parameter_list=deepcf2.parameters())
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
             for e in range(NUM_EPOCHES):
                 sys.stderr.write('epoch %d\n' % e)
                 for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
@@ -289,7 +287,7 @@ class TestDygraphDeepCF(unittest.TestCase):
                         fluid.layers.log_loss(prediction2,
                                               to_variable(labels_np[
                                                   slice:slice + BATCH_SIZE])))
-                    loss2.backward(backward_strategy)
+                    loss2.backward()
                     adam2.minimize(loss2)
                     deepcf2.clear_gradients()
                     dy_loss2 = loss2.numpy()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
index 429736803a192a7cdf01522406f95f8e7c892390..720c9f95c251ec54c7e7fa74c8e59e135a8c6be7 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
@@ -52,8 +52,7 @@ class TestDygraphDoubleGrad(TestCase):
              retain_graph=None,
              create_graph=False,
              allow_unused=False):
-        backward_strategy = fluid.dygraph.BackwardStrategy()
-        backward_strategy.sort_sum_gradient = self.sort_sum_gradient
+        fluid.set_flags({'FLAGS_sort_sum_gradient': self.sort_sum_gradient})
         return fluid.dygraph.grad(
             outputs=outputs,
             inputs=inputs,
@@ -61,8 +60,7 @@ class TestDygraphDoubleGrad(TestCase):
             no_grad_vars=no_grad_vars,
             retain_graph=retain_graph,
             create_graph=create_graph,
-            allow_unused=allow_unused,
-            backward_strategy=backward_strategy)
+            allow_unused=allow_unused)
 
     @dygraph_guard
     def test_exception(self):
@@ -310,10 +308,11 @@ class TestDygraphDoubleGradVisitedUniq(TestCase):
                     out = out + linear(input)
             return out
 
-        backward_strategy = fluid.dygraph.BackwardStrategy()
-        backward_strategy.sort_sum_gradient = True
+        fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+
         with fluid.dygraph.guard():
             paddle.manual_seed(123)
+            paddle.framework.random._manual_program_seed(123)
             a = fluid.dygraph.to_variable(value)
             a.stop_gradient = False
 
@@ -324,18 +323,18 @@ class TestDygraphDoubleGradVisitedUniq(TestCase):
                 inputs=[a],
                 create_graph=False,
                 only_inputs=True,
-                allow_unused=False,
-                backward_strategy=backward_strategy)
+                allow_unused=False)
 
             grad_1 = dx[0].numpy()
 
         with fluid.dygraph.guard():
             paddle.manual_seed(123)
+            paddle.framework.random._manual_program_seed(123)
             a = fluid.dygraph.to_variable(value)
             a.stop_gradient = False
 
             out = model_f(a)
-            out.backward(backward_strategy)
+            out.backward()
 
             grad_2 = a.gradient()
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index b7ebd23a0b74208e768ea4e67b69dc4a596c6764..b752b439f0fa945f75c8cb0c1478668e3dd2c6d5 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -56,13 +56,11 @@ class Generator(fluid.Layer):
 class TestDygraphGAN(unittest.TestCase):
     def test_gan_float32(self):
         seed = 90
-
+        paddle.manual_seed(1)
+        paddle.framework.random._manual_program_seed(1)
         startup = fluid.Program()
-        startup.random_seed = seed
         discriminate_p = fluid.Program()
         generate_p = fluid.Program()
-        discriminate_p.random_seed = seed
-        generate_p.random_seed = seed
 
         scope = fluid.core.Scope()
         with new_program_scope(
@@ -133,8 +131,8 @@ class TestDygraphGAN(unittest.TestCase):
 
         dy_params = dict()
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(1)
+            paddle.framework.random._manual_program_seed(1)
 
             discriminator = Discriminator()
             generator = Generator()
@@ -177,11 +175,9 @@ class TestDygraphGAN(unittest.TestCase):
 
         dy_params2 = dict()
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            paddle.manual_seed(1)
+            paddle.framework.random._manual_program_seed(1)
             discriminator2 = Discriminator()
             generator2 = Generator()
             sgd2 = SGDOptimizer(
@@ -201,7 +197,7 @@ class TestDygraphGAN(unittest.TestCase):
                     x=d_fake2, label=to_variable(np.zeros([2, 1], np.float32))))
 
             d_loss2 = d_loss_real2 + d_loss_fake2
-            d_loss2.backward(backward_strategy)
+            d_loss2.backward()
             sgd2.minimize(d_loss2)
             discriminator2.clear_gradients()
             generator2.clear_gradients()
@@ -211,7 +207,7 @@ class TestDygraphGAN(unittest.TestCase):
             g_loss2 = fluid.layers.reduce_mean(
                 fluid.layers.sigmoid_cross_entropy_with_logits(
                     x=d_fake2, label=to_variable(np.ones([2, 1], np.float32))))
-            g_loss2.backward(backward_strategy)
+            g_loss2.backward()
             sgd2.minimize(g_loss2)
             for p in discriminator2.parameters():
                 dy_params2[p.name] = p.numpy()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
index 01f3c02774698376c576a446f32634583623a737..4db6f2d0da1d5287a3c9ccca7c5459e5915a514f 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
@@ -61,12 +61,10 @@ class GCN(fluid.Layer):
 
 class TestDygraphGNN(unittest.TestCase):
     def test_gnn_float32(self):
-        seed = 90
-
+        paddle.manual_seed(90)
+        paddle.framework.random._manual_program_seed(90)
         startup = fluid.Program()
-        startup.random_seed = seed
         main = fluid.Program()
-        main.random_seed = seed
 
         scope = fluid.core.Scope()
         with new_program_scope(main=main, startup=startup, scope=scope):
@@ -114,8 +112,8 @@ class TestDygraphGNN(unittest.TestCase):
                 scope.find_var(model.gc.weight.name).get_tensor())
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(90)
+            paddle.framework.random._manual_program_seed(90)
 
             features = np.ones([1, 100, 50], dtype=np.float32)
             # Use selected rows when it's supported.
@@ -140,8 +138,8 @@ class TestDygraphGNN(unittest.TestCase):
             model_gc_weight_value = model.gc.weight.numpy()
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(90)
+            paddle.framework.random._manual_program_seed(90)
 
             features2 = np.ones([1, 100, 50], dtype=np.float32)
             # Use selected rows when it's supported.
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py
index 4fe4d963ca5ee4cff1e7073d11361de69e68aa9f..317353684317f6fa0e8cf37cda58f2041e70befd 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py
@@ -62,8 +62,7 @@ class Test_Forward_Hook(unittest.TestCase):
             with fluid.dygraph.guard(place):
                 fluid.default_startup_program().random_seed = seed
                 fluid.default_main_program().random_seed = seed
-                backward_strategy = fluid.dygraph.BackwardStrategy()
-                backward_strategy.sort_sum_gradient = True
+                fluid.set_flags({'FLAGS_sort_sum_gradient': True})
 
                 input_word = np.array(
                     [0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7,
@@ -132,8 +131,7 @@ class Test_Forward_Hook(unittest.TestCase):
             with fluid.dygraph.guard(place):
                 fluid.default_startup_program().random_seed = seed
                 fluid.default_main_program().random_seed = seed
-                backward_strategy = fluid.dygraph.BackwardStrategy()
-                backward_strategy.sort_sum_gradient = True
+                fluid.set_flags({'FLAGS_sort_sum_gradient': True})
 
                 global call_forward_hook
                 global call_forward_pre_hook
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
index b15ad911ee79d47011be6eaa4bde62ba71c55c0e..f61d1ab888a51b2ebe4d1205b30fb84dfa4e7aeb 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
@@ -40,9 +40,8 @@ class LeNetDygraph(fluid.dygraph.Layer):
         if num_classes > 0:
             self.fc = nn.Sequential(
                 nn.Linear(400, 120),
-                nn.Linear(120, 84),
-                nn.Linear(
-                    84, 10, act=classifier_activation))
+                nn.Linear(120, 84), nn.Linear(84, 10),
+                nn.Softmax())  # TODO: accept any activation
 
     def forward(self, inputs):
         x = self.features(inputs)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py
index 69fd7d80327f1a666870dc76e041449366565b01..f0fea2d7eb75cff376ebce3505e175030619697c 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import unittest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.dygraph.nn import Embedding
@@ -94,8 +95,8 @@ class TestDygraphSimpleNet(unittest.TestCase):
 
             for is_sort_sum_gradient in [True, False]:
                 with fluid.dygraph.guard(place):
-                    fluid.default_startup_program().random_seed = seed
-                    fluid.default_main_program().random_seed = seed
+                    paddle.manual_seed(seed)
+                    paddle.framework.random._manual_program_seed(seed)
 
                     simple_net = SimpleNet(
                         hidden_size=hidden_size,
@@ -113,8 +114,9 @@ class TestDygraphSimpleNet(unittest.TestCase):
                     dy_loss = None
 
                     helper = DyGraphProgramDescTracerTestHelper(self)
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = is_sort_sum_gradient
+                    fluid.set_flags({
+                        'FLAGS_sort_sum_gradient': is_sort_sum_gradient
+                    })
 
                     for i in range(batch_num):
                         x_data = np.arange(12).reshape(4, 3).astype('int64')
@@ -129,7 +131,7 @@ class TestDygraphSimpleNet(unittest.TestCase):
                         if i == 0:
                             for param in simple_net.parameters():
                                 dy_param_init[param.name] = param.numpy()
-                        dy_loss.backward(backward_strategy)
+                        dy_loss.backward()
                         sgd.minimize(dy_loss)
                         sgd.clear_gradients()
                         if i == batch_num - 1:
@@ -138,8 +140,8 @@ class TestDygraphSimpleNet(unittest.TestCase):
                     dy_loss_value = dy_loss.numpy()
 
                 with new_program_scope():
-                    fluid.default_startup_program().random_seed = seed
-                    fluid.default_main_program().random_seed = seed
+                    paddle.manual_seed(seed)
+                    paddle.framework.random._manual_program_seed(seed)
 
                     simple_net = SimpleNet(
                         hidden_size=hidden_size,
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py
index 4ce0ca350ddb9e8b9873a1650eefa1d5b2db4938..bda1958c0f3544bef51e51cf418ae6c07bdd7056 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py
@@ -36,8 +36,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase):
         with fluid.dygraph.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
 
             mnist2 = MNIST()
             sgd2 = SGDOptimizer(
@@ -69,7 +68,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase):
                         for param in mnist2.parameters():
                             dy_param_init_value2[param.name] = param.numpy()
 
-                    avg_loss2.backward(backward_strategy)
+                    avg_loss2.backward()
                     sgd2.minimize(avg_loss2)
                     mnist2.clear_gradients()
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
index 246b013f1ada6bc853711e146379b8bb2df5e363..5400b785d2929b4ff8614d4a6dbe26f197bf5ad1 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
 import six
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit
@@ -401,10 +402,9 @@ class TestDygraphOCRAttention(unittest.TestCase):
                 dtype='int64').reshape([1, Config.max_length])))
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             ocr_attention = OCRAttention()
 
             if Config.learning_rate_decay == "piecewise_decay":
@@ -438,7 +438,7 @@ class TestDygraphOCRAttention(unittest.TestCase):
                         for param in ocr_attention.parameters():
                             if param.name not in dy_param_init_value:
                                 dy_param_init_value[param.name] = param.numpy()
-                    avg_loss.backward(backward_strategy)
+                    avg_loss.backward()
                     dy_grad_value = {}
                     for param in ocr_attention.parameters():
                         if param.trainable:
@@ -454,8 +454,8 @@ class TestDygraphOCRAttention(unittest.TestCase):
                         dy_param_value[param.name] = param.numpy()
 
         with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             ocr_attention = OCRAttention()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
index a7783afc5cff3da97b623aec3297881013724a78..7876675bcc6a1cb5ea190adfa16fb5e4de8c2e35 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
@@ -74,8 +74,8 @@ class TestImperativeOptimizerBase(unittest.TestCase):
 
         with fluid.dygraph.guard(place):
             try:
-                fluid.default_startup_program().random_seed = seed
-                fluid.default_main_program().random_seed = seed
+                paddle.manual_seed(seed)
+                paddle.framework.random._manual_program_seed(seed)
                 mlp = MLP()
                 optimizer = self.get_optimizer_dygraph(
                     parameter_list=mlp.parameters())
@@ -91,8 +91,8 @@ class TestImperativeOptimizerBase(unittest.TestCase):
             ) else fluid.CUDAPlace(0)
 
         with fluid.dygraph.guard(place):
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             mlp = MLP()
             optimizer = self.get_optimizer_dygraph(
@@ -132,8 +132,8 @@ class TestImperativeOptimizerBase(unittest.TestCase):
                     dy_param_value[param.name] = param.numpy()
 
         with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             if place == None:
                 place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py
index 9f75c92b185ed338eca15cab1b624da97b1fda33..619e9e8e90783365b5f0d718783a14468520c8d4 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py
@@ -74,8 +74,8 @@ class TestImperativeOptimizerBase(unittest.TestCase):
 
         with fluid.dygraph.guard(place):
             try:
-                fluid.default_startup_program().random_seed = seed
-                fluid.default_main_program().random_seed = seed
+                paddle.manual_seed(seed)
+                paddle.framework.random._manual_program_seed(seed)
                 mlp = MLP()
                 optimizer = self.get_optimizer_dygraph(
                     parameter_list=mlp.parameters())
@@ -91,8 +91,8 @@ class TestImperativeOptimizerBase(unittest.TestCase):
             ) else fluid.CUDAPlace(0)
 
         with fluid.dygraph.guard(place):
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             mlp = MLP()
             optimizer = self.get_optimizer_dygraph(
@@ -132,8 +132,8 @@ class TestImperativeOptimizerBase(unittest.TestCase):
                     dy_param_value[param.name] = param.numpy()
 
         with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             if place == None:
                 place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
@@ -200,7 +200,7 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
     def get_optimizer_dygraph(self, parameter_list):
         bd = [3, 6, 9]
         optimizer = SGDOptimizer(
-            learning_rate=fluid.layers.piecewise_decay(
+            learning_rate=paddle.optimizer.PiecewiseLR(
                 boundaries=bd,
                 values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
             parameter_list=parameter_list)
@@ -208,7 +208,7 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
 
     def get_optimizer(self):
         bd = [3, 6, 9]
-        optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
+        optimizer = SGDOptimizer(learning_rate=paddle.optimizer.PiecewiseLR(
             boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
         return optimizer
 
@@ -381,9 +381,9 @@ class TestOptimizerLearningRate(unittest.TestCase):
             bd = [2, 4, 6, 8]
             value = [0.2, 0.4, 0.6, 0.8, 1.0]
 
+            scheduler = paddle.optimizer.PiecewiseLR(bd, value)
             adam = paddle.optimizer.Adam(
-                fluid.dygraph.PiecewiseDecay(bd, value, 0),
-                parameters=linear.parameters())
+                scheduler, parameters=linear.parameters())
 
             self.assertTrue(
                 np.allclose(
@@ -393,8 +393,8 @@ class TestOptimizerLearningRate(unittest.TestCase):
             for i in range(12):
                 adam.minimize(loss)
                 lr = adam.get_lr()
-
                 self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0))
+                scheduler.step()
 
     def test_lr_decay_natural_exp(self):
         with fluid.dygraph.guard():
@@ -409,24 +409,21 @@ class TestOptimizerLearningRate(unittest.TestCase):
             loss = fluid.layers.reduce_mean(b)
             base_lr = 1.0
 
+            scheduler = paddle.optimizer.NaturalExpLR(1.0, gamma=0.5)
+            print("scheduler.last_lr", scheduler.last_lr)
             adam = paddle.optimizer.Adam(
-                fluid.dygraph.NaturalExpDecay(
-                    learning_rate=base_lr,
-                    decay_steps=3,
-                    decay_rate=0.5,
-                    staircase=True),
-                parameters=linear.parameters())
+                scheduler, parameters=linear.parameters())
 
             self.assertTrue(
                 np.allclose(
                     adam.get_lr(), 1.0, rtol=1e-06, atol=0.0))
 
-            ret = [1.0, 1.0, 1.0, np.exp(-0.5), np.exp(-0.5)]
-            for i in range(5):
+            ret = [1.0, np.exp(-0.5), np.exp(-1)]
+            for i in range(3):
                 adam.minimize(loss)
                 lr = adam.get_lr()
-
                 self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0))
+                scheduler.step()
 
     def test_set_lr(self):
         with fluid.dygraph.guard():
@@ -451,20 +448,15 @@ class TestOptimizerLearningRate(unittest.TestCase):
                     np.allclose(
                         lr, lr_list[i], rtol=1e-06, atol=0.0))
 
-            lr_var = fluid.layers.create_global_var(
-                shape=[1], value=0.7, dtype='float32')
-            adam.set_lr(lr_var)
-            adam.minimize(loss)
-            lr = adam.get_lr()
-            self.assertTrue(np.allclose(lr, 0.7, rtol=1e-06, atol=0.0))
+            with self.assertRaises(TypeError):
+                lr_var = fluid.layers.create_global_var(
+                    shape=[1], value=0.7, dtype='float32')
+                adam.set_lr(lr_var)
 
             with self.assertRaises(RuntimeError):
                 adam = paddle.optimizer.Adam(
-                    fluid.dygraph.NaturalExpDecay(
-                        learning_rate=0.1,
-                        decay_steps=3,
-                        decay_rate=0.5,
-                        staircase=True),
+                    paddle.optimizer.NaturalExpLR(
+                        learning_rate=0.1, gamma=0.5),
                     parameters=linear.parameters())
                 adam.set_lr(0.01)
 
@@ -666,7 +658,7 @@ class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase):
 class TestImperativePipelineOptimizer(TestImperativeOptimizerBase):
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(learning_rate=0.5,
-                                         parameter_list=parameter_list)
+                                         parameters=parameter_list)
         optimizer = PipelineOptimizer(optimizer)
         return optimizer
 
@@ -678,7 +670,7 @@ class TestImperativePipelineOptimizer(TestImperativeOptimizerBase):
 class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase):
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(learning_rate=0.5,
-                                         parameter_list=parameter_list)
+                                         parameters=parameter_list)
         optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5)
         return optimizer
 
@@ -690,7 +682,7 @@ class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase):
 class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase):
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(learning_rate=0.5,
-                                         parameter_list=parameter_list)
+                                         parameters=parameter_list)
         optimizer = RecomputeOptimizer(optimizer)
         return optimizer
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
index bd629f5f4a69a9a8c94f1b2cc58935f0e991ead0..fa23ff8e7c29fa5a07cab03f7407910f687ce9ee 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import unittest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.dygraph.nn import Embedding
@@ -225,8 +226,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
         traced_layer = None
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -293,8 +294,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
             dy_last_hidden_value = last_hidden.numpy()
 
         with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
                 vocab_size=vocab_size,
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py
index 8e85fe5dfefea3221fe0566ac506b1277263eec2..0487f8dd9a640b7d337dbc603030e6dd6bbe7ef7 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import unittest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.dygraph.nn import Embedding
@@ -43,10 +44,10 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
+
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -82,7 +83,7 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
                 if i == 0:
                     for param in ptb_model.parameters():
                         dy_param_init[param.name] = param.numpy()
-                dy_loss.backward(backward_strategy)
+                dy_loss.backward()
                 sgd.minimize(dy_loss)
                 ptb_model.clear_gradients()
                 if i == batch_num - 1:
@@ -94,8 +95,9 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
             dy_last_hidden_value = last_hidden.numpy()
 
         with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
+
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
                 vocab_size=vocab_size,
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
index 735ec4d3f1ea869a17bea4efdba9e5dcedb39fb6..0076c61e584074ed091b9b0c80e9aa5be00e48fb 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
@@ -64,8 +64,8 @@ class TestImperativeMnist(unittest.TestCase):
         mask = np.array(mask_list).astype("float32")
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             policy = Policy(input_size=4)
 
@@ -105,8 +105,8 @@ class TestImperativeMnist(unittest.TestCase):
                 dy_param_value[param.name] = param.numpy()
 
         with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
index 815437072fde291b8d8348dba0b4b0ae872ec1b9..e8a2298c17d001abeac2f113df08ee01b94c9422 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -251,8 +251,8 @@ class TestDygraphResnet(unittest.TestCase):
         traced_layer = None
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             resnet = ResNet()
             optimizer = optimizer_setting(
@@ -334,8 +334,8 @@ class TestDygraphResnet(unittest.TestCase):
                     dy_param_value[param.name] = param.numpy()
 
         with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py
index 8cbd08ea3e245f70a6a4aceb3f6c9e0b83356981..13b12da3318cad709b1978dc581ff479a1d842c6 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py
@@ -77,10 +77,10 @@ class TestDygraphResnetSortGradient(unittest.TestCase):
         batch_size = train_parameters["batch_size"]
         batch_num = 10
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
+
             resnet = ResNet()
             optimizer = optimizer_setting(
                 train_parameters, parameter_list=resnet.parameters())
@@ -119,7 +119,7 @@ class TestDygraphResnetSortGradient(unittest.TestCase):
                         if param.name not in dy_param_init_value:
                             dy_param_init_value[param.name] = param.numpy()
 
-                avg_loss.backward(backward_strategy)
+                avg_loss.backward()
 
                 dy_grad_value = {}
                 for param in resnet.parameters():
@@ -137,8 +137,8 @@ class TestDygraphResnetSortGradient(unittest.TestCase):
                     dy_param_value[param.name] = param.numpy()
 
         with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
index eb9dc926c8207f4de4a6ce7e3d0dc89cc2b965fd..48aea3a584dd25667704b22d99d1074c481bb76c 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
@@ -219,8 +219,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -305,8 +305,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -374,6 +374,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
                 adam._learning_rate.step_num = 0
 
             para_state_dict, opti_state_dict = paddle.load("./test_dy")
+            self.assertIn('LR_Scheduler', opti_state_dict)
             adam.set_dict(opti_state_dict)
 
             opti_dict = adam.state_dict()
@@ -414,8 +415,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -521,8 +522,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -634,8 +635,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -713,8 +712,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -804,9 +803,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
+
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
                 vocab_size=vocab_size,
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
index 4ab35a21aff43af822821c14007fbdd69a081803..e81d1c8610f6bebffadf930b67dc14a4a418ef05 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
@@ -219,8 +219,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -239,10 +239,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
 
             place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
             ) else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.PiecewiseLR(
+                boundaries=bd, values=lr_arr)
             adam = Adam(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr_arr),
-                parameters=ptb_model.parameters())
+                learning_rate=scheduler, parameters=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -268,7 +268,9 @@ class TestDygraphPtbRnn(unittest.TestCase):
                         dy_param_init[param.name] = param.numpy()
                 dy_loss.backward()
                 adam.minimize(dy_loss)
+                scheduler.step()
                 ptb_model.clear_gradients()
+
                 if i == batch_num - 1:
                     for param in ptb_model.parameters():
                         dy_param_updated[param.name] = param.numpy()
@@ -283,7 +285,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
                 else:
                     self.base_opti[k] = v
 
-            fluid.save_dygraph(self.opti_dict, "./test_dy")
+            fluid.save_dygraph(self.opti_dict, "./test_dy_v2")
 
             self.state_dict = ptb_model.state_dict()
 
@@ -292,7 +294,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
                 np_t = v.numpy()
                 self.model_base[k] = np_t
 
-            paddle.save(self.state_dict, "./test_dy")
+            paddle.save(self.state_dict, "./test_dy_v2")
 
     def testLoadAndSetVarBase(self):
         seed = 90
@@ -305,8 +307,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -325,10 +327,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
 
             place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
             ) else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.PiecewiseLR(
+                boundaries=bd, values=lr_arr)
             adam = Adam(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr_arr),
-                parameters=ptb_model.parameters())
+                learning_rate=scheduler, parameters=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -354,6 +356,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
                         dy_param_init[param.name] = param.numpy()
                 dy_loss.backward()
                 adam.minimize(dy_loss)
+                scheduler.step()
                 ptb_model.clear_gradients()
                 if i == batch_num - 1:
                     for param in ptb_model.parameters():
@@ -370,10 +373,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
 
                     self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
 
-            if isinstance(adam._learning_rate, LearningRateDecay):
-                adam._learning_rate.step_num = 0
-
-            para_state_dict, opti_state_dict = paddle.load("./test_dy")
+            para_state_dict, opti_state_dict = paddle.load("./test_dy_v2")
             adam.set_state_dict(opti_state_dict)
 
             opti_dict = adam.state_dict()
@@ -414,8 +414,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -434,10 +434,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
 
             place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
             ) else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.PiecewiseLR(
+                boundaries=bd, values=lr_arr)
             adam = Adam(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr_arr),
-                parameters=ptb_model.parameters())
+                learning_rate=scheduler, parameters=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -463,6 +463,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
                         dy_param_init[param.name] = param.numpy()
                 dy_loss.backward()
                 adam.minimize(dy_loss)
+                scheduler.step()
                 ptb_model.clear_gradients()
                 if i == batch_num - 1:
                     for param in ptb_model.parameters():
@@ -521,8 +522,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -541,10 +542,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
 
             place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
             ) else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.PiecewiseLR(
+                boundaries=bd, values=lr_arr)
             adam = Adam(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr_arr),
-                parameters=ptb_model.parameters())
+                learning_rate=scheduler, parameters=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -570,6 +571,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
                         dy_param_init[param.name] = param.numpy()
                 dy_loss.backward()
                 adam.minimize(dy_loss)
+                scheduler.step()
                 ptb_model.clear_gradients()
                 if i == batch_num - 1:
                     for param in ptb_model.parameters():
@@ -634,8 +636,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -713,8 +715,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -745,7 +747,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
             last_hidden = None
             last_cell = None
 
-            state_dict, opti_dict = fluid.load_dygraph("./test_dy")
+            state_dict, opti_dict = fluid.load_dygraph("./test_dy_v2")
             adam.set_state_dict(opti_dict)
             ptb_model.set_dict(state_dict)
 
@@ -804,8 +806,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
         batch_num = 200
 
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
                 hidden_size=hidden_size,
@@ -825,9 +827,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
 
             place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
             ) else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.PiecewiseLR(
+                boundaries=bd, values=lr_arr)
             adam = Adam(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr_arr),
+                learning_rate=scheduler,
                 beta1=0.8,
                 beta2=0.6,
                 parameters=ptb_model.parameters())
@@ -867,14 +870,16 @@ class TestDygraphPtbRnn(unittest.TestCase):
                                                             init_cell)
 
                 dy_loss.backward()
+                scheduler.step()
                 adam.minimize(dy_loss)
                 ptb_model.clear_gradients()
 
             opti_dict = adam.state_dict()
             for k, v in opti_dict.items():
-                if k == "global_step":
+                if k == "LR_Scheduler":
                     self.assertTrue(
-                        np.array_equal(v.numpy(), self.base_opti[v.name] + 1))
+                        np.array_equal(v['last_epoch'], self.base_opti[k][
+                            'last_epoch'] + 1))
 
                 if k.find("beta1_pow_acc_0") > 0:
                     self.assertTrue(
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
index 283addaf6283a5365d983e4737bf2a8fdf5ee0b9..a04e1e4e5aafeeb605348b30125c5d42b3171674 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
@@ -308,8 +308,8 @@ class TestImperativeResneXt(unittest.TestCase):
         batch_num = 1
         epoch_num = 1
         with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             se_resnext = SeResNeXt()
             optimizer = optimizer_setting(
@@ -367,8 +367,8 @@ class TestImperativeResneXt(unittest.TestCase):
                         dy_param_value[param.name] = param.numpy()
 
         with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
 
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
index 9878e2f9ad772fe3d03addb4ced9f3b66a6cd58a..59ddb365e539603c1eba06ca8828fc244b6e542d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
@@ -48,8 +48,9 @@ class TestSimpleNet(unittest.TestCase):
             for dtype in ["float32", "float64"]:
                 for sort_sum_gradient in [True, False]:
                     paddle.disable_static(place)
-                    backward_strategy = paddle.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = sort_sum_gradient
+                    fluid.set_flags({
+                        'FLAGS_sort_sum_gradient': sort_sum_gradient
+                    })
                     # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
 
                     input_word = np.array([[1, 2], [2, 1]]).astype('int64')
@@ -65,7 +66,7 @@ class TestSimpleNet(unittest.TestCase):
                     self.assertTrue(emb.weight.gradient() is None)
                     self.assertTrue(input_emb.gradient() is None)
 
-                    input_emb.backward(backward_strategy)
+                    input_emb.backward()
                     adam.minimize(input_emb)
                     self.assertTrue(emb.weight.gradient() is not None)
 
@@ -84,8 +85,9 @@ class TestSimpleNet(unittest.TestCase):
         for place in places:
             for sort_sum_gradient in [True, False]:
                 with fluid.dygraph.guard(place):
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = sort_sum_gradient
+                    fluid.set_flags({
+                        'FLAGS_sort_sum_gradient': sort_sum_gradient
+                    })
                     grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
 
                     input_word = np.array([[1, 2], [2, 1]]).astype('int64')
@@ -101,7 +103,7 @@ class TestSimpleNet(unittest.TestCase):
                     self.assertTrue(emb.weight.gradient() is None)
                     self.assertTrue(input_emb.gradient() is None)
 
-                    input_emb.backward(backward_strategy)
+                    input_emb.backward()
                     adam.minimize(input_emb)
                     self.assertTrue(emb.weight.gradient() is not None)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py
index a42a62019ba54a771d26ad853e39fcf8ca991180..794f59e48507e6002311e54e8ae31f3ad1bf4647 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import unittest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.dygraph.nn import Embedding
@@ -101,8 +102,8 @@ class TestDygraphSimpleNet(unittest.TestCase):
             for is_sort_sum_gradient in [True, False]:
                 traced_layer = None
                 with fluid.dygraph.guard(place):
-                    fluid.default_startup_program().random_seed = seed
-                    fluid.default_main_program().random_seed = seed
+                    paddle.manual_seed(seed)
+                    paddle.framework.random._manual_program_seed(seed)
 
                     simple_net = SimpleNet(
                         hidden_size=hidden_size,
@@ -119,8 +120,9 @@ class TestDygraphSimpleNet(unittest.TestCase):
                     dy_param_init = dict()
                     dy_loss = None
 
-                    backward_strategy = fluid.dygraph.BackwardStrategy()
-                    backward_strategy.sort_sum_gradient = is_sort_sum_gradient
+                    fluid.set_flags({
+                        'FLAGS_sort_sum_gradient': is_sort_sum_gradient
+                    })
 
                     for i in range(batch_num):
                         x_data = np.arange(12).reshape(4, 3).astype('int64')
@@ -135,7 +137,7 @@ class TestDygraphSimpleNet(unittest.TestCase):
                         if i == 0:
                             for param in simple_net.parameters():
                                 dy_param_init[param.name] = param.numpy()
-                        dy_loss.backward(backward_strategy)
+                        dy_loss.backward()
                         sgd.minimize(dy_loss)
                         sgd.clear_gradients()
                         if i == batch_num - 1:
@@ -144,8 +146,8 @@ class TestDygraphSimpleNet(unittest.TestCase):
                     dy_loss_value = dy_loss.numpy()
 
                 with new_program_scope():
-                    fluid.default_startup_program().random_seed = seed
-                    fluid.default_main_program().random_seed = seed
+                    paddle.manual_seed(seed)
+                    paddle.framework.random._manual_program_seed(seed)
 
                     simple_net = SimpleNet(
                         hidden_size=hidden_size,
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
index 649dc1ad91d3878dacc551fd08527885c3f479aa..e94157fa047eef065bc4bd0bfb3d6b6c778ea7b9 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
@@ -468,8 +468,8 @@ def build_optimizer(layer, cfg, loss=None):
 
 class DyGraphTrainModel(object):
     def __init__(self, cfg):
-        fluid.default_startup_program().random_seed = cfg.seed
-        fluid.default_main_program().random_seed = cfg.seed
+        paddle.manual_seed(cfg.seed)
+        paddle.framework.random._manual_program_seed(cfg.seed)
 
         self.generator = Generator(cfg)
         self.discriminator = Discriminator(cfg)
@@ -479,8 +479,7 @@ class DyGraphTrainModel(object):
 
         self.cfg = cfg
 
-        self.backward_strategy = fluid.dygraph.BackwardStrategy()
-        self.backward_strategy.sort_sum_gradient = cfg.sort_sum_gradient
+        fluid.set_flags({'FLAGS_sort_sum_gradient': cfg.sort_sum_gradient})
 
     def clear_gradients(self):
         if self.g_optimizer:
@@ -497,7 +496,7 @@ class DyGraphTrainModel(object):
         g_loss = get_generator_loss(image_real, label_org, label_trg,
                                     self.generator, self.discriminator,
                                     self.cfg)
-        g_loss.backward(self.backward_strategy)
+        g_loss.backward()
         if self.g_optimizer:
             self.g_optimizer.minimize(g_loss)
 
@@ -506,7 +505,7 @@ class DyGraphTrainModel(object):
         d_loss = get_discriminator_loss(image_real, label_org, label_trg,
                                         self.generator, self.discriminator,
                                         self.cfg)
-        d_loss.backward(self.backward_strategy)
+        d_loss.backward()
         if self.d_optimizer:
             self.d_optimizer.minimize(d_loss)
 
@@ -530,12 +529,12 @@ class StaticGraphTrainModel(object):
                 shape=[None, cfg.c_dim], dtype='float32', name='label_trg')
             return image_real, label_org, label_trg
 
+        paddle.manual_seed(cfg.seed)
+        paddle.framework.random._manual_program_seed(cfg.seed)
         self.gen_program = fluid.Program()
         gen_startup_program = fluid.Program()
 
         with fluid.program_guard(self.gen_program, gen_startup_program):
-            self.gen_program.random_seed = cfg.seed
-            gen_startup_program.random_seed = cfg.seed
             with fluid.unique_name.guard():
                 image_real, label_org, label_trg = create_data_layer()
                 generator = Generator(cfg)
@@ -547,8 +546,6 @@ class StaticGraphTrainModel(object):
         self.dis_program = fluid.Program()
         dis_startup_program = fluid.Program()
         with fluid.program_guard(self.dis_program, dis_startup_program):
-            self.dis_program.random_seed = cfg.seed
-            dis_startup_program.random_seed = cfg.seed
             with fluid.unique_name.guard():
                 image_real, label_org, label_trg = create_data_layer()
                 generator = Generator(cfg)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py
index acc56b7db27f48cad92ed44cddfcfd4b9591dba3..f10d2df7f06f98334df62d3021403d686054b7d9 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py
@@ -121,8 +121,7 @@ class TestImperativeStaticModelRunnerMnist(unittest.TestCase):
         with fluid.dygraph.guard(place):
             fluid.default_startup_program().random_seed = self.seed
             fluid.default_main_program().random_seed = self.seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
 
             mnist = fluid.dygraph.static_runner.StaticModelRunner(
                 model_dir=self.save_dirname,
@@ -156,7 +155,7 @@ class TestImperativeStaticModelRunnerMnist(unittest.TestCase):
                     loss = fluid.layers.cross_entropy(cost, label)
                     avg_loss = fluid.layers.mean(loss)
 
-                    avg_loss.backward(backward_strategy)
+                    avg_loss.backward()
                     sgd.minimize(avg_loss)
                     mnist.clear_gradients()
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py
index 0792582175ef03cba3d3ba809132f3c591ecfe87..db47170c7bfff4575a9b4dcf694cd8ed722b0b8f 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py
@@ -111,9 +111,7 @@ class TestImperativeStaticModelRunnerWhile(unittest.TestCase):
             fluid.default_startup_program().random_seed = self.seed
             fluid.default_main_program().random_seed = self.seed
             np.random.seed(self.seed)
-
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
 
             while_net = fluid.dygraph.static_runner.StaticModelRunner(
                 self.save_dirname)
@@ -141,7 +139,7 @@ class TestImperativeStaticModelRunnerWhile(unittest.TestCase):
                 loss = fluid.layers.cross_entropy(cost, label)
                 avg_loss = fluid.layers.mean(loss)
 
-                avg_loss.backward(backward_strategy)
+                avg_loss.backward()
                 sgd.minimize(avg_loss)
                 while_net.clear_gradients()
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
index 29cc718f14ff98de2b668d313d380d784cbaa6ef..9f58ef881e4e47365be03cdb5786ac292c938f03 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import unittest
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import Embedding, LayerNorm, Linear, Layer
 from paddle.fluid.dygraph import to_variable, guard
@@ -949,10 +950,9 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
         seed = 90
 
         with guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             transformer = TransFormer(
                 ModelHyperParams.src_vocab_size,
                 ModelHyperParams.trg_vocab_size,
@@ -1021,7 +1021,7 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
                     for param in transformer.parameters():
                         dy_param_init[param.name] = param.numpy()
 
-                dy_avg_cost.backward(backward_strategy)
+                dy_avg_cost.backward()
                 optimizer.minimize(dy_avg_cost)
                 transformer.clear_gradients()
 
@@ -1035,8 +1035,8 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
             dy_token_num_value = dy_token_num.numpy()
 
         with new_program_scope():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
+            paddle.manual_seed(seed)
+            paddle.framework.random._manual_program_seed(seed)
             transformer = TransFormer(
                 ModelHyperParams.src_vocab_size,
                 ModelHyperParams.trg_vocab_size,
diff --git a/python/paddle/fluid/tests/unittests/test_input_spec.py b/python/paddle/fluid/tests/unittests/test_input_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..e329a37488a2cb8234532cd0a9beb7a1a25e72a6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_input_spec.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+from paddle.static import InputSpec
+from paddle.fluid.framework import core, convert_np_dtype_to_dtype_
+
+
+class TestInputSpec(unittest.TestCase):
+    def test_default(self):
+        tensor_spec = InputSpec([3, 4])
+        self.assertEqual(tensor_spec.dtype,
+                         convert_np_dtype_to_dtype_('float32'))
+        self.assertEqual(tensor_spec.name, None)
+
+    def test_from_tensor(self):
+        x_bool = fluid.layers.fill_constant(shape=[1], dtype='bool', value=True)
+        bool_spec = InputSpec.from_tensor(x_bool)
+        self.assertEqual(bool_spec.dtype, x_bool.dtype)
+        self.assertEqual(bool_spec.shape, x_bool.shape)
+        self.assertEqual(bool_spec.name, x_bool.name)
+        # Compare against the literal name passed in, not the spec itself.
+        bool_spec2 = InputSpec.from_tensor(x_bool, name='bool_spec')
+        self.assertEqual(bool_spec2.name, 'bool_spec')
+
+    def test_from_numpy(self):
+        x_numpy = np.ones([10, 12])
+        x_np_spec = InputSpec.from_numpy(x_numpy)
+        self.assertEqual(x_np_spec.dtype,
+                         convert_np_dtype_to_dtype_(x_numpy.dtype))
+        self.assertEqual(x_np_spec.shape, x_numpy.shape)
+        self.assertEqual(x_np_spec.name, None)
+
+        x_numpy2 = np.array([1, 2, 3, 4]).astype('int64')
+        x_np_spec2 = InputSpec.from_numpy(x_numpy2, name='x_np_int64')
+        self.assertEqual(x_np_spec2.dtype,
+                         convert_np_dtype_to_dtype_(x_numpy2.dtype))
+        self.assertEqual(x_np_spec2.shape, x_numpy2.shape)
+        self.assertEqual(x_np_spec2.name, 'x_np_int64')
+
+    def test_shape_with_none(self):
+        tensor_spec = InputSpec([None, 4, None], dtype='int8', name='x_spec')
+        self.assertEqual(tensor_spec.dtype, convert_np_dtype_to_dtype_('int8'))
+        self.assertEqual(tensor_spec.name, 'x_spec')
+        self.assertEqual(tensor_spec.shape, (-1, 4, -1))
+
+    def test_shape_raise_error(self):
+        # 1. shape should only contain int and None.
+        with self.assertRaises(ValueError):
+            tensor_spec = InputSpec(['None', 4, None], dtype='int8')
+
+        # 2. shape should be type `list` or `tuple`
+        with self.assertRaises(TypeError):
+            tensor_spec = InputSpec(4, dtype='int8')
+
+        # 3. len(shape) should be greater than 0.
+        with self.assertRaises(ValueError):
+            tensor_spec = InputSpec([], dtype='int8')
+
+    def test_batch_and_unbatch(self):
+        tensor_spec = InputSpec([10])
+        # insert batch_size
+        batch_tensor_spec = tensor_spec.batch(16)
+        self.assertEqual(batch_tensor_spec.shape, (16, 10))
+
+        # unbatch
+        unbatch_spec = batch_tensor_spec.unbatch()
+        self.assertEqual(unbatch_spec.shape, (10, ))
+
+        # 1. `unbatch` requires len(shape) > 1
+        with self.assertRaises(ValueError):
+            unbatch_spec.unbatch()
+
+        # 2. `batch` requires len(batch_size) == 1
+        with self.assertRaises(ValueError):
+            tensor_spec.batch([16, 12])
+
+        # 3. `batch` requires type(batch_size) == int
+        with self.assertRaises(TypeError):
+            tensor_spec.batch('16')
+
+    def test_eq_and_hash(self):
+        tensor_spec_1 = InputSpec([10, 16], dtype='float32')
+        tensor_spec_2 = InputSpec([10, 16], dtype='float32')
+        tensor_spec_3 = InputSpec([10, 16], dtype='float32', name='x')
+        tensor_spec_4 = InputSpec([16], dtype='float32', name='x')
+
+        # override ``__eq__`` according to [shape, dtype, name]
+        self.assertTrue(tensor_spec_1 == tensor_spec_2)
+        self.assertTrue(tensor_spec_1 != tensor_spec_3)  # different name
+        self.assertTrue(tensor_spec_3 != tensor_spec_4)  # different shape
+
+        # override ``__hash__``  according to [shape, dtype]
+        self.assertTrue(hash(tensor_spec_1) == hash(tensor_spec_2))
+        self.assertTrue(hash(tensor_spec_1) == hash(tensor_spec_3))
+        self.assertTrue(hash(tensor_spec_3) != hash(tensor_spec_4))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
index c5228fcf122748d2518238aa21ea486ed5f60d46..eaa7e711a29c7b96691f630733d913003fce9e43 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
@@ -37,10 +37,10 @@ class TestIrMemoryOptimizeIfElseOp(unittest.TestCase):
                                   use_cuda=True,
                                   use_mem_opt=False,
                                   iter_num=5):
+        paddle.manual_seed(100)
+        paddle.framework.random._manual_program_seed(100)
         prog = Program()
         startup_prog = Program()
-        prog.random_seed = 100
-        startup_prog.random_seed = 100
         with program_guard(prog, startup_prog):
             image = layers.data(name='x', shape=[784], dtype='float32')
 
diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
index 4d7711a5df9fc3b70bcb3137dee0bcc949135266..87b6e76a6d0ab7f5fba7c4526734d81475e1540e 100644
--- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
@@ -18,12 +18,12 @@ import os
 import pickle
 import unittest
 import numpy as np
-
 import paddle
+from paddle.static import InputSpec
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import Linear
 from paddle.fluid.dygraph import declarative, ProgramTranslator
-from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME
+from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME
 
 BATCH_SIZE = 32
 BATCH_NUM = 10
@@ -80,7 +80,7 @@ class LinearNetReturnLoss(fluid.dygraph.Layer):
 
 def train(layer, input_size=784, label_size=1):
     # create optimizer
-    adam = fluid.optimizer.SGDOptimizer(
+    sgd = fluid.optimizer.SGDOptimizer(
         learning_rate=0.01, parameter_list=layer.parameters())
     # create data loader
     train_loader = fluid.io.DataLoader.from_generator(capacity=5)
@@ -97,7 +97,7 @@ def train(layer, input_size=784, label_size=1):
         avg_loss = fluid.layers.mean(loss)
 
         avg_loss.backward()
-        adam.minimize(avg_loss)
+        sgd.minimize(avg_loss)
         layer.clear_gradients()
     return [img], layer, avg_loss
 
@@ -108,7 +108,8 @@ class TestJitSaveLoad(unittest.TestCase):
         # enable dygraph mode
         fluid.enable_dygraph()
         # config seed
-        fluid.default_main_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
 
     def train_and_save_model(self, model_path=None, configs=None):
         layer = LinearNet(784, 1)
@@ -149,14 +150,14 @@ class TestJitSaveLoad(unittest.TestCase):
         train_layer.train()
         load_train_layer.train()
         # train & compare
-        _, _, train_loss = train(train_layer)
-        _, _, load_train_loss = train(load_train_layer)
+        img0, _, train_loss = train(train_layer)
+        img1, _, load_train_loss = train(load_train_layer)
         self.assertTrue(
             np.array_equal(train_loss.numpy(), load_train_loss.numpy()))
 
     def load_dygraph_state_dict(self, train_layer):
         train_layer.eval()
-        # contruct new model
+        # construct new model
         new_layer = LinearNet(784, 1)
         model_dict, _ = fluid.dygraph.load_dygraph(self.model_path)
         new_layer.set_dict(model_dict)
@@ -176,7 +177,7 @@ class TestJitSaveLoad(unittest.TestCase):
                 model_path=self.model_path,
                 input_spec=example_inputs)
 
-    def test_load_dygraoh_no_path(self):
+    def test_load_dygraph_no_path(self):
         model_path = "model.test_jit_save_load.no_path"
         new_layer = LinearNet(784, 1)
         with self.assertRaises(ValueError):
@@ -202,12 +203,99 @@ class TestJitSaveLoad(unittest.TestCase):
             model_dict, _ = fluid.dygraph.load_dygraph(model_path)
 
 
+class LinearNetMultiInput(fluid.dygraph.Layer):
+    def __init__(self, in_size, out_size):
+        super(LinearNetMultiInput, self).__init__()
+        self._linear1 = Linear(in_size, out_size)
+        # forward() applies this single Linear layer to both x and y.
+
+    @declarative(input_spec=[
+        InputSpec(
+            [None, 8], dtype='float32'), InputSpec(
+                [None, 8], dtype='float32')
+    ])
+    def forward(self, x, y):
+        x_out = self._linear1(x)
+        y_out = self._linear1(y)
+        loss = fluid.layers.mean(x_out + y_out)
+        return x_out, y_out, loss
+
+
+class TestSaveLoadWithInputSpec(unittest.TestCase):
+    def setUp(self):
+        # enable dygraph mode
+        fluid.enable_dygraph()
+
+    def test_with_input_spec(self):
+        net = LinearNetReturnLoss(8, 8)
+        # set x.shape = [None, 8]
+        net.forward = declarative(
+            net.forward, input_spec=[InputSpec(
+                [None, 8], name='x')])
+
+        model_path = "model.input_spec.output_spec"
+        configs = fluid.dygraph.jit.SaveLoadConfig()
+        # check inputs and outputs
+        self.assertTrue(len(net.forward.inputs) == 1)
+        input_x = net.forward.inputs[0]
+        self.assertTrue(input_x.shape == (-1, 8))
+        self.assertTrue(input_x.name == 'x')
+
+        # 1. prune loss
+        configs.output_spec = net.forward.outputs[:1]
+        fluid.dygraph.jit.save(net, model_path, configs=configs)
+
+        # 2. load to infer
+        infer_layer = fluid.dygraph.jit.load(model_path, configs=configs)
+        x = fluid.dygraph.to_variable(
+            np.random.random((4, 8)).astype('float32'))
+        pred = infer_layer(x)
+
+    def test_multi_in_out(self):
+        net = LinearNetMultiInput(8, 8)
+
+        model_path = "model.multi_inout.output_spec1"
+        configs = fluid.dygraph.jit.SaveLoadConfig()
+        # 1. check inputs and outputs
+        self.assertTrue(len(net.forward.inputs) == 2)
+        input_x = net.forward.inputs[0]
+        input_y = net.forward.inputs[1]
+        self.assertTrue(input_x.shape == (-1, 8))
+        self.assertTrue(input_y.shape == (-1, 8))
+
+        # 2. prune loss
+        configs.output_spec = net.forward.outputs[:2]
+        fluid.dygraph.jit.save(net, model_path, configs=configs)
+
+        # 3. load to infer
+        infer_layer = fluid.dygraph.jit.load(model_path, configs=configs)
+        x = fluid.dygraph.to_variable(
+            np.random.random((4, 8)).astype('float32'))
+        y = fluid.dygraph.to_variable(
+            np.random.random((4, 8)).astype('float32'))
+        # 4. predict
+        pred_x, pred_y = infer_layer(x, y)
+
+        # 5. prune y and loss, saving with only input_x as the input spec
+        model_path = "model.multi_inout.output_spec2"
+        configs.output_spec = net.forward.outputs[:1]
+        fluid.dygraph.jit.save(net, model_path, [input_x], configs)
+        # 6. load again
+        infer_layer2 = fluid.dygraph.jit.load(model_path, configs=configs)
+        # 7. predict with the single remaining input
+        pred_xx = infer_layer2(x)
+
+        # 8. assert pred_x == pred_xx
+        self.assertTrue(np.allclose(pred_x.numpy(), pred_xx.numpy()))
+
+
 class TestJitSaveLoadConfig(unittest.TestCase):
     def setUp(self):
         # enable dygraph mode
         fluid.enable_dygraph()
         # config seed
-        fluid.default_main_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
 
     def basic_save_load(self, layer, model_path, configs):
         # 1. train & save
@@ -299,7 +387,8 @@ class TestJitMultipleLoading(unittest.TestCase):
         # enable dygraph mode
         fluid.enable_dygraph()
         # config seed
-        fluid.default_main_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
         # train and save base model
         self.train_and_save_orig_model()
 
@@ -340,7 +429,8 @@ class TestJitPruneModelAndLoad(unittest.TestCase):
         # enable dygraph mode
         fluid.enable_dygraph()
         # config seed
-        fluid.default_main_program().random_seed = SEED
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
 
     def train_and_save(self):
         train_layer = LinearNetReturnHidden(8, 8)
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 1992a3bb39807a62966e245d24888cc074746e8d..b76887f0965ca64b2b40bf9c0ce6e82b44fdad2f 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -57,8 +57,8 @@ class LayerTest(unittest.TestCase):
     @contextlib.contextmanager
     def static_graph(self):
         with new_program_scope():
-            fluid.default_startup_program().random_seed = self.seed
-            fluid.default_main_program().random_seed = self.seed
+            paddle.manual_seed(self.seed)
+            paddle.framework.random._manual_program_seed(self.seed)
             yield
 
     def get_static_graph_result(self,
@@ -77,8 +77,8 @@ class LayerTest(unittest.TestCase):
     def dynamic_graph(self, force_to_use_cpu=False):
         with fluid.dygraph.guard(
                 self._get_place(force_to_use_cpu=force_to_use_cpu)):
-            fluid.default_startup_program().random_seed = self.seed
-            fluid.default_main_program().random_seed = self.seed
+            paddle.manual_seed(self.seed)
+            paddle.framework.random._manual_program_seed(self.seed)
             yield
 
 
@@ -299,7 +299,7 @@ class TestLayer(LayerTest):
                 my_syncbn = paddle.nn.SyncBatchNorm(3)
                 dy_ret = my_syncbn(base.to_variable(t))
                 dy_ret_value = dy_ret.numpy()
-            self.assertTrue(np.array_equal(static_ret, static_ret))
+            self.assertTrue(np.array_equal(static_ret, dy_ret_value))
 
     def test_relu(self):
         with self.static_graph():
@@ -1034,7 +1034,7 @@ class TestLayer(LayerTest):
             static_rlt2 = self.get_static_graph_result(
                 feed=feed_dict, fetch_list=[nce_loss2])[0]
 
-        with self.dynamic_graph(force_to_use_cpu=True):
+        with self.dynamic_graph():
             words = []
             for i in range(window_size):
                 words.append(base.to_variable(inp_word[i]))
@@ -1070,7 +1070,7 @@ class TestLayer(LayerTest):
         self.assertTrue(np.allclose(static_rlt2, static_rlt))
         self.assertTrue(np.allclose(dy_rlt_value, static_rlt))
 
-        with self.dynamic_graph(force_to_use_cpu=True):
+        with self.dynamic_graph():
             custom_weight = np.random.randn(dict_size, 128).astype("float32")
             weight_attr = fluid.ParamAttr(
                 initializer=fluid.initializer.NumpyArrayInitializer(
@@ -1996,13 +1996,13 @@ class TestLayer(LayerTest):
             exe = fluid.Executor(place)
 
             exe.run(fluid.default_startup_program())
-            x = np.random.rand(3, 32, 32).astype("float32")
-            y = np.array([[1], [0], [1]])
+            # x = np.random.rand(3, 32, 32).astype("float32")
+            # y = np.array([[1], [0], [1]])
             static_out = exe.run(feed={"input": x,
                                        "label": y},
                                  fetch_list=result[0])
 
-        with self.dynamic_graph():
+        with self.dynamic_graph(force_to_use_cpu=True):
             data = base.to_variable(x)
             label = base.to_variable(y)
             fc_out = fluid.layers.fc(data, size=10)
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index 9a2e7b85e5202288b62a640e41e06f131b0cba84..36368a83893c7eea3e5842638b3fc677e1a1b936 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -523,491 +523,5 @@ class TestLinearWamrupLearningRateDecayWithScalarInput(unittest.TestCase):
         run_places(lr, start_lr, end_lr)
 
 
-def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss,
-                         var_list):
-    def is_better(current, best, m, n):
-        if m == 'min' and n == 'rel':
-            return current < best - best * threshold
-        elif m == 'min' and n == 'abs':
-            return current < best - threshold
-        elif m == 'max' and n == 'rel':
-            return current > best + best * threshold
-        else:  # mode == 'max' and epsilon_mode == 'abs':
-            return current > best + threshold
-
-    if var_list[2] > 0:
-        var_list[2] -= 1
-        return var_list[1]
-
-    if is_better(loss, var_list[0], m, n):
-        var_list[0] = loss
-        var_list[3] = 0
-    else:
-        var_list[3] += 1
-        if var_list[3] > patience:
-            var_list[2] = cooldown
-            var_list[3] = 0
-            new_lr = var_list[1] * decay_rate
-            var_list[1] = new_lr if var_list[1] - new_lr > 1e-8 else var_list[1]
-
-    return var_list[1]
-
-
-class TestReduceLROnPlateauDecay(unittest.TestCase):
-    def test_ReduceLR(self):
-        # the decay rate must be less than 1.0
-        with self.assertRaises(ValueError):
-            paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=2.0)
-        # the mode must be "min" or "max"
-        with self.assertRaises(ValueError):
-            paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, mode="test")
-        # the threshold_mode must be "rel" or "abs"
-        with self.assertRaises(ValueError):
-            paddle.optimizer.ReduceLROnPlateau(
-                learning_rate=1.0, threshold_mode="test")
-        with self.assertRaises(TypeError):
-            paddle.optimizer.ReduceLROnPlateau(learning_rate="test")
-        with self.assertRaises(TypeError):
-            paddle.optimizer.ReduceLROnPlateau(learning_rate=0.5).step("test")
-
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-
-        for place in places:
-            for m, n in zip(['min', 'max', 'min', 'max'],
-                            ['rel', 'rel', 'abs', 'abs']):
-                kwargs = {
-                    'learning_rate': 1.0,
-                    'mode': m,
-                    'factor': 0.5,
-                    'patience': 3,
-                    'threshold': 1e-4,
-                    'threshold_mode': n,
-                    'cooldown': 1,
-                    'min_lr': 0,
-                    'epsilon': 1e-8,
-                    'verbose': False,
-                }
-                paddle.enable_static()
-                self._test_static(place, kwargs)
-                paddle.disable_static(place)
-                self._test_dygraph(place, kwargs)
-                paddle.enable_static()
-
-    def _test_static(self, place, kwargs):
-        paddle.enable_static()
-
-        best = float("-10000") if kwargs['mode'] == "max" else float("10000")
-        current_lr = 1.0
-        cooldown_counter = 0
-        num_bad_epochs = 0
-        var_list = [best, current_lr, cooldown_counter, num_bad_epochs]
-
-        main_prog = fluid.Program()
-        start_prog = fluid.Program()
-        with fluid.program_guard(main_prog, start_prog):
-            x = fluid.layers.create_global_var(
-                [1], 1, 'float32', persistable=True)
-            paddle.increment(x)
-            loss = paddle.sin(x)
-            scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs)
-            adam = fluid.optimizer.Adam(learning_rate=scheduler)
-            adam.minimize(loss)
-            lr_var = adam._global_learning_rate()
-            test_prog = main_prog.clone()
-
-        exe = fluid.Executor(place)
-        exe.run(start_prog)
-
-        for epoch in range(20):
-            for batch_id in range(1):
-                out, actual_lr = exe.run(main_prog,
-                                         fetch_list=[loss.name, lr_var.name])
-                expected_lr = reduce_lr_on_plateau(
-                    kwargs['factor'], kwargs['threshold'], kwargs['cooldown'],
-                    kwargs['patience'], kwargs['mode'],
-                    kwargs['threshold_mode'], out[0], var_list)
-
-            scheduler.step(out[0])
-            actual_lr = scheduler()
-            self.assertEqual(actual_lr, np.array(expected_lr))
-
-        for epoch in range(10):
-            for batch_id in range(1):
-                out, actual_lr = exe.run(test_prog,
-                                         fetch_list=[loss.name, lr_var.name])
-                expected_lr = reduce_lr_on_plateau(
-                    kwargs['factor'], kwargs['threshold'], kwargs['cooldown'],
-                    kwargs['patience'], kwargs['mode'],
-                    kwargs['threshold_mode'], out[0], var_list)
-            scheduler.step(out[0])
-            actual_lr = scheduler()
-            self.assertEqual(actual_lr, np.array(expected_lr))
-
-    def _test_dygraph(self, place, kwargs):
-        paddle.disable_static(place)
-
-        best = float("-10000") if kwargs['mode'] == "max" else float("10000")
-        current_lr = 1.0
-        cooldown_counter = 0
-        num_bad_epochs = 0
-        var_list = [best, current_lr, cooldown_counter, num_bad_epochs]
-
-        linear = paddle.nn.Linear(10, 10)
-        scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs)
-        sgd = paddle.optimizer.SGD(learning_rate=scheduler,
-                                   parameter_list=linear.parameters())
-
-        for epoch in range(20):
-            for batch_id in range(1):
-                x = paddle.to_tensor(epoch).astype('float32')
-                loss = paddle.sin(x)
-                loss.backward()
-                sgd.minimize(loss)
-
-            scheduler.step(loss)
-            # get lr from paddle
-            current_lr = scheduler()
-            # get lr form python
-            expected_lr = reduce_lr_on_plateau(
-                kwargs['factor'], kwargs['threshold'], kwargs['cooldown'],
-                kwargs['patience'], kwargs['mode'], kwargs['threshold_mode'],
-                loss, var_list)
-            self.assertEqual(current_lr, expected_lr)
-        state_dict = sgd.state_dict()
-        scheduler1 = paddle.optimizer.ReduceLROnPlateau(**kwargs)
-        sgd1 = paddle.optimizer.SGD(learning_rate=scheduler1,
-                                    parameter_list=linear.parameters())
-        sgd1.set_dict(state_dict)
-        self.assertEqual(scheduler.cooldown_counter,
-                         scheduler1.cooldown_counter)
-        self.assertEqual(scheduler.best.numpy()[0], scheduler1.best)
-        self.assertEqual(scheduler.num_bad_epochs, scheduler1.num_bad_epochs)
-        self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch)
-        self.assertEqual(scheduler.last_lr, scheduler1.last_lr)
-
-
-def noam_lr(epoch_num, d_model, warmup_steps, learning_rate=1.0, verbose=False):
-    if epoch_num == 0:
-        a = 1
-    else:
-        a = math.pow(epoch_num, -0.5)
-    b = math.pow(warmup_steps, -1.5) * epoch_num
-    return learning_rate * math.pow(d_model, -0.5) * min(a, b)
-
-
-def lambda_lr(epoch_num, learning_rate, lr_lambda, verbose=False):
-    return learning_rate * lr_lambda(epoch_num)
-
-
-def piecewise_lr(epoch_num, boundaries, values, verbose=False):
-    assert len(boundaries) + 1 == len(values)
-    for i in range(len(boundaries)):
-        if epoch_num < boundaries[i]:
-            return values[i]
-    return values[len(values) - 1]
-
-
-def exponential_lr(epoch_num, learning_rate, gamma, verbose=False):
-    return learning_rate * gamma**epoch_num
-
-
-def natural_exp_lr(epoch_num, learning_rate, gamma, verbose=False):
-    return learning_rate * math.exp(-1 * gamma * epoch_num)
-
-
-def inverse_time_lr(epoch_num, learning_rate, gamma, verbose=False):
-    return learning_rate / (1 + gamma * epoch_num)
-
-
-def polynomial_lr(epoch_num,
-                  learning_rate,
-                  decay_steps,
-                  end_lr=0.0001,
-                  power=1.0,
-                  cycle=False,
-                  verbose=False):
-
-    if cycle:
-        div = math.ceil(epoch_num / float(decay_steps))
-        if epoch_num == 0:
-            div = 1
-        decay_steps = decay_steps * div
-    else:
-        epoch_num = min(epoch_num, decay_steps)
-    return (learning_rate - end_lr) * (
-        (1 - float(epoch_num) / float(decay_steps))**power) + end_lr
-
-    def get_lr(self):
-        if self.last_epoch == 0:
-            return self.base_lr
-        elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
-            return self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos(
-                math.pi / self.T_max)) / 2
-
-        return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (
-            1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * (
-                self.last_lr - self.eta_min) + self.eta_min
-
-
-cosine_annealing_lr_current = None
-
-
-def cosine_annealing_lr(epoch_num,
-                        learning_rate,
-                        T_max,
-                        eta_min=0,
-                        verbose=False):
-    global cosine_annealing_lr_current
-    if epoch_num == 0:
-        cosine_annealing_lr_current = learning_rate
-    elif (epoch_num - 1 - T_max) % (2 * T_max) == 0:
-        cosine_annealing_lr_current = cosine_annealing_lr_current + (
-            learning_rate - eta_min) * (1 - math.cos(math.pi / float(T_max))
-                                        ) / 2
-    else:
-        cosine_annealing_lr_current = (1 + math.cos(
-            math.pi * epoch_num / float(T_max))) / (1 + math.cos(math.pi * (
-                epoch_num - 1) / float(T_max))) * (cosine_annealing_lr_current -
-                                                   eta_min) + eta_min
-    return cosine_annealing_lr_current
-
-
-def linear_warmup_lr(epoch_num,
-                     learning_rate,
-                     warmup_steps,
-                     start_lr,
-                     end_lr,
-                     verbose=False):
-    if epoch_num < warmup_steps:
-        return start_lr + (end_lr - start_lr) * (float(epoch_num) /
-                                                 float(warmup_steps))
-    else:
-        return learning_rate
-
-
-def multi_step_lr(epoch_num,
-                  learning_rate,
-                  milestones,
-                  gamma=0.1,
-                  verbose=False):
-    for i in range(len(milestones)):
-        if epoch_num < milestones[i]:
-            return learning_rate * (gamma**i)
-    return learning_rate * (gamma**len(milestones))
-
-
-def step_lr(epoch_num, learning_rate, step_size, gamma=0.1, verbose=False):
-    return learning_rate * math.pow(gamma, epoch_num // step_size)
-
-
-class TestLRScheduler(unittest.TestCase):
-    def _test_static(self, python_func, paddle_api, kwarg, place):
-        main_prog = fluid.Program()
-        start_prog = fluid.Program()
-        with fluid.program_guard(main_prog, start_prog):
-            x = fluid.data(name='x', shape=[3, 4, 5])
-            y = fluid.data(name='y', shape=[3, 4, 5])
-            z = fluid.layers.fc(x, 100)
-            loss = fluid.layers.mean(z)
-            scheduler = paddle_api(**kwarg)
-            adam = fluid.optimizer.Adam(learning_rate=scheduler)
-            adam.minimize(loss)
-            lr_var = adam._global_learning_rate()
-            test_prog = main_prog.clone()
-
-        num = 0
-        exe = fluid.Executor(place)
-        exe.run(start_prog)
-        for epoch in range(5):
-            for batch_id in range(2):
-                out = exe.run(
-                    main_prog,
-                    feed={
-                        'x': np.random.randn(3, 4, 5).astype('float32'),
-                        'y': np.random.randn(3, 4, 5).astype('float32')
-                    },
-                    fetch_list=lr_var.name)
-            self.assertEqual(out, np.array(python_func(num, **kwarg)))
-            scheduler.step()
-            num += 1
-
-        for epoch in range(5):
-            for batch_id in range(2):
-                out = exe.run(
-                    test_prog,
-                    feed={
-                        'x': np.random.randn(3, 4, 5).astype('float32'),
-                        'y': np.random.randn(3, 4, 5).astype('float32')
-                    },
-                    fetch_list=lr_var.name)
-            self.assertEqual(out, np.array(python_func(num, **kwarg)))
-            scheduler.step()
-            num += 1
-
-        if isinstance(place, fluid.CPUPlace):
-            compiled_train_prog = fluid.CompiledProgram(
-                main_prog).with_data_parallel(
-                    loss_name=loss.name, places=fluid.cpu_places(4))
-            for epoch in range(5):
-                python_result = python_func(num, **kwarg)
-                for batch_id in range(2):
-                    _ = exe.run(
-                        compiled_train_prog,
-                        feed={
-                            'x': np.random.randn(12, 4, 5).astype('float32'),
-                            'y': np.random.randn(12, 4, 5).astype('float32')
-                        },
-                        fetch_list=lr_var.name)
-                scopes = compiled_train_prog._executor.local_scopes()
-                out = np.array(scopes[0].var(lr_var.name).get_tensor())
-                self.assertEqual(out, np.array(python_result))
-                out = np.array(scopes[1].var(lr_var.name).get_tensor())
-                self.assertEqual(out, np.array(python_result))
-                out = np.array(scopes[2].var(lr_var.name).get_tensor())
-                self.assertEqual(out, np.array(python_result))
-                out = np.array(scopes[3].var(lr_var.name).get_tensor())
-                self.assertEqual(out, np.array(python_result))
-                scheduler.step()
-                num += 1
-
-            compiled_test_prog = fluid.CompiledProgram(
-                test_prog).with_data_parallel(
-                    loss_name=loss.name,
-                    share_vars_from=compiled_train_prog,
-                    places=fluid.cpu_places(4))
-            for epoch in range(5):
-                python_result = python_func(num, **kwarg)
-                for batch_id in range(2):
-                    _ = exe.run(
-                        compiled_test_prog,
-                        feed={
-                            'x': np.random.randn(12, 4, 5).astype('float32'),
-                            'y': np.random.randn(12, 4, 5).astype('float32')
-                        },
-                        fetch_list=lr_var.name)
-                scopes = compiled_test_prog._executor.local_scopes()
-                out = np.array(scopes[0].var(lr_var.name).get_tensor())
-                self.assertEqual(out, np.array(python_result))
-                out = np.array(scopes[1].var(lr_var.name).get_tensor())
-                self.assertEqual(out, np.array(python_result))
-                out = np.array(scopes[2].var(lr_var.name).get_tensor())
-                self.assertEqual(out, np.array(python_result))
-                out = np.array(scopes[3].var(lr_var.name).get_tensor())
-                self.assertEqual(out, np.array(python_result))
-                scheduler.step()
-                num += 1
-
-    def _test_dygraph(self, python_func, paddle_api, kwarg, place):
-        x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
-        linear = paddle.nn.Linear(10, 10)
-        scheduler = paddle_api(**kwarg)
-        sgd = paddle.optimizer.SGD(learning_rate=scheduler,
-                                   parameter_list=linear.parameters())
-        for epoch in range(20):
-            for batch_id in range(2):
-                x = paddle.to_tensor(x)
-                out = linear(x)
-                loss = paddle.reduce_mean(out)
-                out.backward()
-                sgd.minimize(loss)
-                linear.clear_gradients()
-
-            self.assertAlmostEqual(sgd.current_step_lr(),
-                                   python_func(epoch, **kwarg))
-            if paddle_api.__name__ != "CosineAnnealingLR":
-                scheduler.step()
-            else:
-                scheduler.step(epoch + 1)
-
-    def test_scheduler(self):
-        with self.assertRaises(NotImplementedError):
-            paddle.optimizer.lr_scheduler._LRScheduler().step()
-        with self.assertRaises(TypeError):
-            paddle.optimizer.MultiStepLR(
-                learning_rate="test", milestones=[1, 2, 3])
-        with self.assertRaises(TypeError):
-            paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones='test')
-        with self.assertRaises(ValueError):
-            paddle.optimizer.MultiStepLR(
-                learning_rate=0.5, milestones=[3, 2, 1])
-        with self.assertRaises(ValueError):
-            paddle.optimizer.MultiStepLR(
-                learning_rate=0.5, milestones=[1, 2, 3], gamma=2)
-
-        func_api_kwargs = [(noam_lr, paddle.optimizer.NoamLR, {
-            "d_model": 0.01,
-            "warmup_steps": 100,
-            "verbose": False
-        }), (piecewise_lr, paddle.optimizer.PiecewiseLR, {
-            "boundaries": [3, 6, 9, 15, 20],
-            "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
-            "verbose": False
-        }), (natural_exp_lr, paddle.optimizer.NaturalExpLR, {
-            "learning_rate": 0.5,
-            "gamma": 0.1,
-            "verbose": False
-        }), (inverse_time_lr, paddle.optimizer.InverseTimeLR, {
-            "learning_rate": 0.5,
-            "gamma": 0.1,
-            "verbose": True
-        }), (polynomial_lr, paddle.optimizer.PolynomialLR, {
-            "learning_rate": 0.5,
-            "decay_steps": 20,
-            "end_lr": 0,
-            "power": 1.0,
-            "cycle": False,
-            "verbose": False
-        }), (polynomial_lr, paddle.optimizer.PolynomialLR, {
-            "learning_rate": 0.5,
-            "decay_steps": 20,
-            "end_lr": 0,
-            "power": 1.0,
-            "cycle": True,
-            "verbose": False
-        }), (linear_warmup_lr, paddle.optimizer.LinearLrWarmup, {
-            'learning_rate': 0.5,
-            'warmup_steps': 20,
-            'start_lr': 0,
-            'end_lr': 0.5,
-            "verbose": False
-        }), (exponential_lr, paddle.optimizer.ExponentialLR, {
-            "learning_rate": 0.5,
-            "gamma": 0.9,
-            "verbose": False
-        }), (multi_step_lr, paddle.optimizer.MultiStepLR, {
-            "learning_rate": 0.5,
-            "milestones": [3, 6, 9, 15, 20],
-            "gamma": 0.8,
-            "verbose": True
-        }), (step_lr, paddle.optimizer.StepLR, {
-            "learning_rate": 0.5,
-            "step_size": 2,
-            "gamma": 0.8,
-            "verbose": False
-        }), (lambda_lr, paddle.optimizer.LambdaLR, {
-            "learning_rate": 0.5,
-            "lr_lambda": lambda x: 0.95**x,
-            "verbose": False
-        }), (cosine_annealing_lr, paddle.optimizer.CosineAnnealingLR, {
-            "learning_rate": 0.5,
-            "T_max": 10,
-            "verbose": True
-        })]
-
-        for python_func, paddle_api, kwarg in func_api_kwargs:
-            places = [fluid.CPUPlace()]
-            if core.is_compiled_with_cuda():
-                places.append(fluid.CUDAPlace(0))
-
-            for place in places:
-                paddle.enable_static()
-                self._test_static(python_func, paddle_api, kwarg, place)
-                paddle.disable_static(place)
-                self._test_dygraph(python_func, paddle_api, kwarg, place)
-                paddle.enable_static()
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_linear.py b/python/paddle/fluid/tests/unittests/test_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d07a80da15dbfd35ffdedbcb09e82d59a84486e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_linear.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+import paddle
+from paddle import fluid, nn
+import paddle.fluid.dygraph as dg
+import paddle.nn.functional as F
+import paddle.fluid.initializer as I
+
+
+class LinearTestCase(unittest.TestCase):
+    """Checks F.linear and paddle.nn.Linear against a NumPy reference."""
+    def setUp(self):
+        # All-ones input, weight and bias: every expected element is 3.0.
+        self.dtype = 'float32'
+        self.input = np.ones((3, 1, 2)).astype(self.dtype)
+        self.weight = np.ones((2, 2)).astype(self.dtype)
+        self.bias = np.ones((2)).astype(self.dtype)
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+
+    def functional(self, place):
+        """Return F.linear(input, weight, bias) computed in dygraph mode."""
+        paddle.disable_static(place)
+        arrays = (self.input, self.weight, self.bias)
+        return F.linear(*[paddle.to_tensor(a) for a in arrays]).numpy()
+
+    def _constant_attr(self, name):
+        """Build a non-trainable ParamAttr initialised to the constant 1.0."""
+        return fluid.ParamAttr(
+            name=name,
+            learning_rate=1.0,
+            trainable=False,
+            regularizer=None,
+            initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0))
+
+    def paddle_nn_layer(self, place):
+        """Return the output of a 2-in/2-out paddle.nn.Linear in dygraph mode."""
+        paddle.disable_static(place)
+        linear = paddle.nn.Linear(
+            2, 2, weight_attr=self._constant_attr("linear_weight"),
+            bias_attr=self._constant_attr("linear_bias"))
+        return linear(paddle.to_tensor(self.input)).numpy()
+
+    def numpy_cal(self):
+        """Return the NumPy reference: matmul(input, weight) + bias."""
+        return np.matmul(self.input, self.weight) + self.bias
+
+    def test_error(self, place=None):
+        """Cross-check F.linear, nn.Linear and NumPy on `place`."""
+        # None -> self.place; the old CPUPlace() default ignored setUp's device.
+        place = self.place if place is None else place
+        res_f = self.functional(place)
+        res_nn = self.paddle_nn_layer(place)
+        res_np = self.numpy_cal()
+        np.testing.assert_array_almost_equal(res_f, res_nn)
+        np.testing.assert_array_almost_equal(res_nn, res_np)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py
index 98f7cd5b6b2dc8c82a71edf7ec36a24921726e3c..53e8b02081ae3acf8a7fb5dd2bc6e05cbc3be901 100755
--- a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py
@@ -21,7 +21,7 @@ import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
-from paddle.nn.functional import *
+from paddle.nn.functional import interpolate
 
 
 def linear_interp_np(input,
diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..04b56677fc158583fe79ec0dc1276210bd2ebbdc
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py
@@ -0,0 +1,438 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import platform
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+from paddle.nn.functional import interpolate
+
+
+def linear_interp_np(input,
+                     out_w,
+                     out_size=None,
+                     actual_shape=None,
+                     align_corners=True,
+                     align_mode=0,
+                     data_layout='NCHW'):
+    """NumPy reference for linear interpolation along the last (width) axis.
+
+    `out_size[0]`, then `actual_shape[0]`, replace `out_w` when provided.
+    With data_layout "NHWC" the two trailing axes are swapped on entry and
+    swapped back on exit. The result is cast to the dtype of `input`.
+    """
+    channel_last = data_layout == "NHWC"
+    if channel_last:
+        input = np.transpose(input, (0, 2, 1))  # NHWC => NCHW
+    for override in (out_size, actual_shape):
+        if override is not None:
+            out_w = override[0]
+    batch_size, channel, in_w = input.shape
+
+    # Source step per output index; it stays 0.0 unless out_w exceeds 1.
+    if out_w <= 1:
+        ratio_w = 0.0
+    elif align_corners:
+        ratio_w = (in_w - 1.0) / (out_w - 1.0)
+    else:
+        ratio_w = 1.0 * in_w / out_w
+
+    # Half-pixel sampling: only for align_mode 0 without align_corners.
+    half_pixel = align_mode == 0 and not align_corners
+    out = np.zeros((batch_size, channel, out_w))
+
+    for j in range(out_w):
+        src = ratio_w * (j + 0.5) - 0.5 if half_pixel else ratio_w * j
+        w = max(0, int(src))
+        wid = 1 if w < in_w - 1 else 0  # right neighbour, clamped at the edge
+        w1lambda = (max(src, 0) if half_pixel else src) - w
+        w2lambda = 1.0 - w1lambda
+        out[:, :, j] = (w2lambda * input[:, :, w] +
+                        w1lambda * input[:, :, w + wid])
+
+    if channel_last:
+        out = np.transpose(out, (0, 2, 1))  # NCHW => NHWC
+
+    return out.astype(input.dtype)
+
+
+class TestLinearInterpOp(OpTest):
+    """OpTest for linear_interp_v2; subclasses override init_test_case."""
+
+    def setUp(self):
+        """Build the op inputs, attributes and NumPy-expected output."""
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCHW'
+        self.init_test_case()
+        self.op_type = "linear_interp_v2"
+        input_np = np.random.random(self.input_shape).astype("float64")
+
+        # The width sits on axis 2 for NCHW, on axis 1 for any other layout.
+        in_w = self.input_shape[2 if self.data_layout == "NCHW" else 1]
+
+        out_w = self.out_w
+        if self.scale > 0:
+            if isinstance(self.scale, (float, int)):
+                self.scale = float(self.scale)
+            if isinstance(self.scale, list):
+                self.scale = float(self.scale[0])
+            out_w = int(in_w * self.scale)
+
+        output_np = linear_interp_np(input_np, out_w, self.out_size,
+                                     self.actual_shape, self.align_corners,
+                                     self.align_mode, self.data_layout)
+        self.inputs = {'X': input_np}
+        # actual_shape is applied last, so it wins when both are set.
+        for size in (self.out_size, self.actual_shape):
+            if size is not None:
+                self.inputs['OutSize'] = size
+
+        self.attrs = dict(
+            out_w=self.out_w,
+            interp_method=self.interp_method,
+            align_corners=self.align_corners,
+            align_mode=self.align_mode,
+            data_layout=self.data_layout)
+        if self.scale > 0:
+            # The 'scale' attribute is stored as a one-element float list.
+            if isinstance(self.scale, (float, int)):
+                self.scale = [float(self.scale)]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        # Non-Linux platforms get a looser absolute tolerance.
+        atol = 1e-7 if platform.system() == "Linux" else 1e-5
+        self.check_output(atol=atol)
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        """Default case: width 100 -> 50, align_corners off, align_mode 1."""
+        self.interp_method = 'linear'
+        self.input_shape = [1, 3, 100]
+        self.out_w = 50
+        self.scale = 0.
+        self.out_size = np.array([50, ]).astype("int32")
+        self.align_corners = False
+        self.align_mode = 1
+
+
+class TestLinearInterpOpDataLayout(TestLinearInterpOp):
+    """Runs the base linear_interp_v2 case with data_layout='NHWC'."""
+
+    def init_test_case(self):
+        # This case differs from the parent only in its layout:
+        # interp_method, input_shape, out_w, scale, out_size, align_corners
+        # and align_mode take exactly the values TestLinearInterpOp assigns,
+        # so they are inherited instead of being repeated here.
+        super(TestLinearInterpOpDataLayout, self).init_test_case()
+        self.data_layout = 'NHWC'
+
+
+class TestLinearInterpOpAlignMode(TestLinearInterpOp):
+    """Runs the base linear_interp_v2 case with align_mode=0."""
+
+    def init_test_case(self):
+        # interp_method, input_shape, out_w, scale, out_size and
+        # align_corners take exactly the values TestLinearInterpOp assigns,
+        # so only align_mode is overridden after delegating to the parent.
+        super(TestLinearInterpOpAlignMode, self).init_test_case()
+        self.align_mode = 0
+
+
+class TestLinearInterpOpScale(TestLinearInterpOp):
+    """Runs the base linear_interp_v2 case with scale=0.5 and align_mode=0."""
+
+    def init_test_case(self):
+        # interp_method, input_shape, out_w, out_size and align_corners take
+        # exactly the values TestLinearInterpOp assigns and are inherited.
+        super(TestLinearInterpOpScale, self).init_test_case()
+        self.scale = 0.5
+        self.align_mode = 0
+
+
+class TestLinearInterpOpSizeTensor(TestLinearInterpOp):
+    """Base case with the target size fed through the 'SizeTensor' input."""
+
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCHW'
+        self.init_test_case()
+        self.op_type = "linear_interp_v2"
+        input_np = np.random.random(self.input_shape).astype("float64")
+        # Both flags are fixed to False here, so the OutSize branches below
+        # are never taken and the size always travels as a SizeTensor list.
+        self.shape_by_1Dtensor = False
+        self.scale_by_1Dtensor = False
+
+        in_w = self.input_shape[2 if self.data_layout == "NCHW" else 1]
+
+        out_w = self.out_w
+        if self.scale > 0:
+            if isinstance(self.scale, (float, int)):
+                self.scale = float(self.scale)
+            if isinstance(self.scale, list):
+                self.scale = float(self.scale[0])
+            out_w = int(in_w * self.scale)
+
+        output_np = linear_interp_np(input_np, out_w, self.out_size,
+                                     self.actual_shape, self.align_corners,
+                                     self.align_mode, self.data_layout)
+
+        self.inputs = {'X': input_np}
+        if self.shape_by_1Dtensor and self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        elif self.shape_by_1Dtensor and self.actual_shape is not None:
+            self.inputs['OutSize'] = self.actual_shape
+        else:
+            # One named int32 array of shape (1,) per requested dimension.
+            self.inputs['SizeTensor'] = [
+                ("x" + str(index), np.ones((1)).astype('int32') * ele)
+                for index, ele in enumerate(self.out_size)
+            ]
+
+        self.attrs = dict(
+            out_w=self.out_w,
+            interp_method=self.interp_method,
+            align_corners=self.align_corners,
+            align_mode=self.align_mode,
+            data_layout=self.data_layout)
+        if self.scale > 0:
+            if isinstance(self.scale, (float, int)):
+                self.scale = [self.scale]
+            # A single scale value is duplicated into a two-element list.
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+
+class TestResizeLinearAPI(unittest.TestCase):
+    """Static-graph check of resize_linear and interpolate(mode='linear')."""
+
+    def test_case(self):
+        """Resize a [1, 3, 64] input to width 128 through eight API calls.
+
+        The target width is given as a list, as a fed tensor, together with
+        actual_shape, or as a fed scale of 2.0; every fetched result is
+        compared with the NumPy reference linear_interp_np.
+        """
+
+        def int32_input(name):
+            return fluid.data(name=name, shape=[1], dtype="int32")
+
+        x = fluid.data(name="x", shape=[1, 3, 64], dtype="float32")
+        # "dim" is declared and fed below but no op in this test consumes it.
+        dim = int32_input("dim")
+        shape_tensor = int32_input("shape_tensor")
+        actual_size = int32_input("actual_size")
+        scale_tensor = fluid.data(
+            name="scale_tensor", shape=[1], dtype="float32")
+
+        align = dict(align_mode=1, align_corners=False)
+        resize = fluid.layers.resize_linear
+        out1 = resize(x, out_shape=[128, ], **align)
+        out2 = resize(x, out_shape=[128], **align)
+        out3 = resize(x, out_shape=shape_tensor, **align)
+        out4 = resize(x, out_shape=[128, ], actual_shape=actual_size, **align)
+        out5 = resize(x, scale=scale_tensor, **align)
+
+        # Scale, list size and tensor size again, through the interpolate API.
+        out6 = interpolate(
+            x,
+            scale_factor=scale_tensor,
+            mode='linear',
+            data_format='NCW',
+            **align)
+        out7 = interpolate(
+            x,
+            size=[128, ],
+            mode='linear',
+            data_format='NCW',
+            **align)
+        out8 = interpolate(
+            x,
+            size=shape_tensor,
+            mode='linear',
+            data_format='NCW',
+            **align)
+
+        x_data = np.random.random((1, 3, 64)).astype("float32")
+        feed = {
+            "x": x_data,
+            "dim": np.array([128]).astype("int32"),
+            "shape_tensor": np.array([128, ]).astype("int32"),
+            "actual_size": np.array([128, ]).astype("int32"),
+            "scale_tensor": np.array([2.0]).astype("float32")
+        }
+
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        results = exe.run(
+            fluid.default_main_program(),
+            feed=feed,
+            fetch_list=[out1, out2, out3, out4, out5, out6, out7, out8],
+            return_numpy=True)
+
+        expect_res = linear_interp_np(
+            x_data, out_w=128, align_mode=1, align_corners=False)
+        for res in results:
+            self.assertTrue(np.allclose(res, expect_res))
+
+
+class TestLinearInterpOpAPI2_0(unittest.TestCase):
+    """Dygraph check of paddle.nn.UpSample(mode='linear') against NumPy."""
+
+    def test_case(self):
+        """Resize width 128 to 64 and compare with linear_interp_np."""
+        align = dict(align_mode=1, align_corners=False)
+        x_data = np.random.random((1, 3, 128)).astype("float32")
+        # The layer is constructed before the dygraph guard is entered.
+        us_1 = paddle.nn.UpSample(
+            size=[64, ],
+            mode='linear',
+            data_format='NCW',
+            **align)
+        expect = linear_interp_np(x_data, out_w=64, **align)
+
+        with fluid.dygraph.guard():
+            x = fluid.dygraph.to_variable(x_data)
+            interp = us_1(x)
+            self.assertTrue(np.allclose(interp.numpy(), expect))
+
+
+class TestResizeLinearOpUint8(OpTest):
+    """CPU-only linear_interp_v2 output check on a uint8 input."""
+
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "linear_interp_v2"
+        # NOTE(review): np.random.random draws from [0, 1), so the uint8 cast
+        # presumably leaves an all-zero input -- confirm this is intended.
+        input_np = np.random.random(self.input_shape).astype("uint8")
+
+        out_w = self.out_w
+        if self.scale > 0:
+            if isinstance(self.scale, (float, int)):
+                self.scale = float(self.scale)
+            if isinstance(self.scale, list):
+                self.scale = float(self.scale[0])
+            out_w = int(self.input_shape[2] * self.scale)
+
+        output_np = linear_interp_np(input_np, out_w, self.out_size,
+                                     self.actual_shape, self.align_corners,
+                                     self.align_mode)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+
+        self.attrs = dict(
+            out_w=self.out_w,
+            interp_method=self.interp_method,
+            align_corners=self.align_corners,
+            align_mode=self.align_mode)
+        if self.scale > 0:
+            if isinstance(self.scale, (float, int)):
+                self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        atol = 1e-7 if platform.system() == "Linux" else 1e-5
+        self.check_output_with_place(place=core.CPUPlace(), atol=atol)
+
+    def init_test_case(self):
+        self.interp_method = 'linear'
+        self.input_shape = [2, 3, 100]
+        self.out_w = 50
+        self.scale = 0.
+        self.out_size = np.array([50, ]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestLinearInterpOpException(unittest.TestCase):
+    """fluid.layers.resize_linear is expected to raise ValueError here."""
+
+    def test_exception(self):
+        """Each malformed call below must raise ValueError."""
+
+        def build(name, shape, out_shape, data_format):
+            # Declare a float32 input, then try to build the op on it.
+            x = fluid.data(name=name, shape=shape, dtype="float32")
+            fluid.layers.resize_linear(
+                x, out_shape=out_shape, data_format=data_format)
+
+        # (input name, input shape, out_shape, data_format)
+        bad_cases = [
+            # input_shape_error: rank-1 input
+            ("x1", [1], [256, ], 'NCW'),
+            # data_format_error: data_format 'NHWCD'
+            ("x2", [1, 3, 128], [256, ], 'NHWCD'),
+            # out_shape_error: two output sizes requested
+            ("x3", [1, 3, 128], [256, 256, ], 'NHWC'),
+        ]
+        for case in bad_cases:
+            self.assertRaises(ValueError, build, *case)
+
+
+class TestLinearInterpOpError(unittest.TestCase):
+    """paddle.nn.UpSample(mode='linear') is expected to raise ValueError."""
+
+    def test_error(self):
+        """Each malformed case below must raise ValueError.
+
+        All three run under one program_guard(Program(), Program()).
+        """
+
+        def upsample(name, shape, size, data_format):
+            # Declare the input, build the layer, then apply it.
+            x = fluid.data(name=name, shape=shape, dtype="float32")
+            layer = paddle.nn.UpSample(
+                size=size, data_format=data_format, mode='linear')
+            layer(x)
+
+        # (input name, input shape, size, data_format)
+        bad_cases = [
+            # input_shape_error: rank-1 input
+            ("x1", [1], [256, ], 'NCW'),
+            # data_format_error: data_format 'NHWCD'
+            ("x2", [1, 3, 128], [256, ], 'NHWCD'),
+            # out_shape_error: two output sizes requested
+            ("x3", [1, 3, 128], [256, 256, ], 'NHWC'),
+        ]
+        with program_guard(Program(), Program()):
+            for case in bad_cases:
+                self.assertRaises(ValueError, upsample, *case)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_logical_op.py b/python/paddle/fluid/tests/unittests/test_logical_op.py
index b26b6ab6c3ce7cc68ad877b183eb8733293b9228..c8bb8c5b73f7680fc8a329656ef2b899f14d96ea 100755
--- a/python/paddle/fluid/tests/unittests/test_logical_op.py
+++ b/python/paddle/fluid/tests/unittests/test_logical_op.py
@@ -21,59 +21,231 @@ import paddle
 import paddle.fluid as fluid
 from paddle.static import Program, program_guard
 
+# Operators covered by this file.
+#   'op_str'    - attribute name looked up with getattr() on both `paddle`
+#                 and `np` to obtain the operator and its reference.
+#   'binary_op' - whether the operator is called with a second operand 'y'.
+# Each entry is copied with dict() before extra keys are added to it.
+TEST_META_OP_DATA = [
+    # two-operand operators
+    dict(op_str='logical_and', binary_op=True),
+    dict(op_str='logical_or', binary_op=True),
+    dict(op_str='logical_xor', binary_op=True),
+    # single-operand operator
+    dict(op_str='logical_not', binary_op=False),
+]
 
-def create_test_class(op_type, callback, binary_op=True):
-    class Cls(op_test.OpTest):
-        def setUp(self):
-            a = np.random.choice(a=[True, False], size=(10, 7)).astype(bool)
-            if binary_op:
-                b = np.random.choice(a=[True, False], size=(10, 7)).astype(bool)
-                c = callback(a, b)
-            else:
-                c = callback(a)
-            self.outputs = {'Out': c}
-            self.op_type = op_type
-            if binary_op:
-                self.inputs = {'X': a, 'Y': b}
+# Operand shape pairs for the non-error runs, keyed by a descriptive label.
+# The driver visible below iterates over .values() of a copy of this dict.
+# Every pair is broadcastable under NumPy rules (shapes aligned from the
+# last axis, sizes equal or 1), which the NumPy reference computation
+# relies on.
+TEST_META_SHAPE_DATA = {
+    # x has more dimensions than y
+    'XDimLargerThanYDim1': dict(
+        x_shape=[2, 3, 4, 5],
+        y_shape=[4, 5]),
+    'XDimLargerThanYDim2': dict(
+        x_shape=[2, 3, 4, 5],
+        y_shape=[4, 1]),
+    'XDimLargerThanYDim3': dict(
+        x_shape=[2, 3, 4, 5],
+        y_shape=[1, 4, 1]),
+    'XDimLargerThanYDim4': dict(
+        x_shape=[2, 3, 4, 5],
+        y_shape=[3, 4, 1]),
+    'XDimLargerThanYDim5': dict(
+        x_shape=[2, 3, 1, 5],
+        y_shape=[3, 1, 1]),
+
+    # x has fewer dimensions than y
+    'XDimLessThanYDim1': dict(
+        x_shape=[4, 1],
+        y_shape=[2, 3, 4, 5]),
+    'XDimLessThanYDim2': dict(
+        x_shape=[1, 4, 1],
+        y_shape=[2, 3, 4, 5]),
+    'XDimLessThanYDim3': dict(
+        x_shape=[3, 4, 1],
+        y_shape=[2, 3, 4, 5]),
+    'XDimLessThanYDim4': dict(
+        x_shape=[3, 1, 1],
+        y_shape=[2, 3, 1, 5]),
+    'XDimLessThanYDim5': dict(
+        x_shape=[4, 5],
+        y_shape=[2, 3, 4, 5]),
+
+    # x has three dimensions, y has four; each operand has a size-1 axis
+    # where the other one does not
+    'Axis1InLargerDim': dict(
+        x_shape=[1, 4, 5],
+        y_shape=[2, 3, 1, 5]),
+
+    # x and y have the same number of dimensions
+    'EqualDim1': dict(
+        x_shape=[10, 7],
+        y_shape=[10, 7]),
+    'EqualDim2': dict(
+        x_shape=[1, 1, 4, 5],
+        y_shape=[2, 3, 1, 5]),
+}
+
+# Shape pairs used when test_error is set: for these the binary operators
+# are asserted to raise in both static and dygraph mode.
+TEST_META_WRONG_SHAPE_DATA = {
+    'ErrorDim1': dict(
+        x_shape=[2, 3, 4, 5],
+        y_shape=[3, 4]),
+    'ErrorDim2': dict(
+        x_shape=[2, 3, 4, 5],
+        y_shape=[4, 3]),
+}
+
+
+def run_static(x_np, y_np, op_str, use_gpu=False, binary_op=True):
+    """Run the paddle operator named `op_str` in static-graph mode.
+
+    `y_np` is declared and fed only when `binary_op` is true. CUDAPlace(0) is
+    used only if `use_gpu` is set and Paddle was compiled with CUDA.
+    """
+    paddle.enable_static()
+    startup_program, main_program = fluid.Program(), fluid.Program()
+    on_gpu = use_gpu and fluid.core.is_compiled_with_cuda()
+    exe = fluid.Executor(paddle.CUDAPlace(0) if on_gpu else paddle.CPUPlace())
+    with fluid.program_guard(main_program, startup_program):
+        x = paddle.static.data(name='x', shape=x_np.shape, dtype='bool')
+        op = getattr(paddle, op_str)
+        operands, feed = [x], {'x': x_np}
+        if binary_op:
+            operands.append(
+                paddle.static.data(name='y', shape=y_np.shape, dtype='bool'))
+            feed['y'] = y_np
+        res = op(*operands)
+        exe.run(startup_program)
+        return exe.run(main_program, feed=feed, fetch_list=[res])
+
+
+def run_dygraph(x_np, y_np, op_str, use_gpu=False, binary_op=True):
+    """Apply the paddle operator named `op_str` in dygraph mode.
+
+    `y_np` is converted and passed only when `binary_op` is true. Static mode
+    is not re-enabled before the result is returned.
+    """
+    on_gpu = use_gpu and fluid.core.is_compiled_with_cuda()
+    paddle.disable_static(paddle.CUDAPlace(0) if on_gpu else paddle.CPUPlace())
+    op = getattr(paddle, op_str)
+    operands = [paddle.to_tensor(x_np)]
+    if binary_op:
+        operands.append(paddle.to_tensor(y_np))
+    return op(*operands)
+
+
+def np_data_generator(np_shape, *args, **kwargs):  # extra arguments are unused
+    return np.random.choice([True, False], np_shape).astype(bool)
+
+
+def test(unit_test, use_gpu=False, test_error=False):  # shared driver called by TestCPU / TestCUDA
+    for op_data in TEST_META_OP_DATA:
+        meta_data = dict(op_data)  # shallow copy, so keys added below stay out of the shared table
+        meta_data['use_gpu'] = use_gpu
+        np_op = getattr(np, meta_data['op_str'])  # the numpy function of the same name is the reference
+        META_DATA = dict(TEST_META_SHAPE_DATA)
+        if test_error:
+            META_DATA = dict(TEST_META_WRONG_SHAPE_DATA)
+        for shape_data in META_DATA.values():
+            meta_data['x_np'] = np_data_generator(shape_data['x_shape'])
+            meta_data['y_np'] = np_data_generator(shape_data['y_shape'])
+            if meta_data['binary_op'] and test_error:
+                # catch C++ Exception: with wrong-shape inputs both run modes must raise
+                unit_test.assertRaises(BaseException, run_static, **meta_data)
+                unit_test.assertRaises(BaseException, run_dygraph, **meta_data)
+                continue
+            static_result = run_static(**meta_data)
+            dygraph_result = run_dygraph(**meta_data)
+            if meta_data['binary_op']:
+                np_result = np_op(meta_data['x_np'], meta_data['y_np'])
             else:
-                self.inputs = {'X': a}
-
-        def test_output(self):
-            self.check_output()
-
-        def test_error(self):
-            with program_guard(Program(), Program()):
-
-                # test 1 type error, x, y must be bool type
-                x = fluid.layers.data(name='x', shape=[2], dtype='bool')
-                y = fluid.layers.data(name='y', shape=[2], dtype='bool')
-                a = fluid.layers.data(name='a', shape=[2], dtype='int32')
-                op = eval("fluid.layers.%s" % self.op_type)
-                if self.op_type != "logical_not":
-                    self.assertRaises(TypeError, op, x=x, y=y, out=1)
-                    self.assertRaises(TypeError, op, x=x, y=a)
-                    self.assertRaises(TypeError, op, x=a, y=y)
-                else:
-                    self.assertRaises(TypeError, op, x=x, out=1)
-                    self.assertRaises(TypeError, op, x=a)
-
-                # test 2 type error, x, y must be same shape
-                x_data = fluid.layers.data(
-                    name='x_data', shape=[2], dtype='bool')
-                y_data = fluid.layers.data(
-                    name='y_data', shape=[2, 2], dtype='bool')
-
-                if self.op_type != "logical_not":
-                    self.assertRaises(TypeError, op, x=x_data, y=y_data, out=1)
-                    self.assertRaises(TypeError, op, x=y_data, y=x_data)
-
-    globals()[op_type] = Cls
-
-
-create_test_class('logical_and', lambda _a, _b: np.logical_and(_a, _b))
-create_test_class('logical_or', lambda _a, _b: np.logical_or(_a, _b))
-create_test_class('logical_not', lambda _a: np.logical_not(_a), False)
-create_test_class('logical_xor', lambda _a, _b: np.logical_xor(_a, _b))
+                np_result = np_op(meta_data['x_np'])  # unary op: y_np is generated but not used
+            unit_test.assertTrue((static_result == np_result).all())  # static-graph output vs numpy
+            unit_test.assertTrue((dygraph_result.numpy() == np_result).all())  # dygraph output vs numpy
+
+
+def test_type_error(unit_test, use_gpu, type_str_map):  # type_str_map: {'x': dtype name, 'y': dtype name}
+    def check_type(op_str, x, y, binary_op):  # asserts that paddle.<op_str> rejects the given operands
+        op = getattr(paddle, op_str)
+        error_type = TypeError
+        if isinstance(x, np.ndarray):  # numpy inputs mark the dygraph pass; wrap them as tensors
+            x = paddle.to_tensor(x)
+            y = paddle.to_tensor(y)
+            error_type = BaseException  # the tensor path is checked against the broadest exception type
+        if binary_op:
+            if type_str_map['x'] != 'bool' or type_str_map['y'] != 'bool':  # only non-bool operands must fail
+                unit_test.assertRaises(error_type, op, x=x, y=y)
+            if not fluid.in_dygraph_mode():  # a non-variable `out` is only checked in static mode
+                unit_test.assertRaises(error_type, op, x=x, y=y, out=1)
+        else:
+            if type_str_map['x'] != 'bool':
+                unit_test.assertRaises(error_type, op, x=x)
+            if not fluid.in_dygraph_mode():
+                unit_test.assertRaises(error_type, op, x=x, out=1)
+
+    place = paddle.CPUPlace()
+    if use_gpu and fluid.core.is_compiled_with_cuda():  # falls back to CPU when CUDA is not compiled in
+        place = paddle.CUDAPlace(0)
+    for op_data in TEST_META_OP_DATA:
+        meta_data = dict(op_data)
+        binary_op = meta_data['binary_op']
+
+        paddle.disable_static(place)  # pass 1: dygraph mode with numpy-backed tensors
+        x = np.random.choice(a=[0, 1], size=[10]).astype(type_str_map['x'])
+        y = np.random.choice(a=[0, 1], size=[10]).astype(type_str_map['y'])
+        check_type(meta_data['op_str'], x, y, binary_op)
+
+        paddle.enable_static()  # pass 2: static mode with placeholder variables; static mode stays on afterwards
+        startup_program = paddle.static.Program()
+        main_program = paddle.static.Program()
+        with paddle.static.program_guard(main_program, startup_program):
+            x = paddle.static.data(
+                name='x', shape=[10], dtype=type_str_map['x'])
+            y = paddle.static.data(
+                name='y', shape=[10], dtype=type_str_map['y'])
+            check_type(meta_data['op_str'], x, y, binary_op)
+
+
+def type_map_factory():
+    """List {'x': dtype, 'y': dtype} for every ordered dtype pair."""
+    dtypes = ['float32', 'float64', 'int32', 'int64', 'bool']
+    pairs = []
+    for x_type in dtypes:
+        pairs.extend({'x': x_type, 'y': y_type} for y_type in dtypes)
+    return pairs
+
+
+class TestCPU(unittest.TestCase):
+    def test(self):
+        test(self, use_gpu=False, test_error=False)
+
+    def test_error(self):
+        test(self, use_gpu=False, test_error=True)
+
+    def test_type_error(self):
+        # One pass per (x dtype, y dtype) combination.
+        for dtype_pair in type_map_factory():
+            test_type_error(self, use_gpu=False, type_str_map=dtype_pair)
+
+
+class TestCUDA(unittest.TestCase):
+    def test(self):
+        test(self, use_gpu=True, test_error=False)
+
+    def test_error(self):
+        test(self, use_gpu=True, test_error=True)
+
+    def test_type_error(self):
+        # One pass per (x dtype, y dtype) combination.
+        for dtype_pair in type_map_factory():
+            test_type_error(self, use_gpu=True, type_str_map=dtype_pair)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..f655e363e964893a7ab4c0a966856f873800ff6c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
@@ -0,0 +1,519 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import copy
+import math
+import numpy as np
+import unittest
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import paddle.fluid.framework as framework
+import paddle.fluid.core as core
+
+
+def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss,
+                         var_list):
+    """Reference ReduceLROnPlateau step; updates var_list in place, returns lr.
+
+    var_list holds [best, lr, cooldown_counter, num_bad_epochs].
+    """
+    def is_better(current, best, m, n):
+        if m == 'min' and n in ('rel', 'abs'):
+            margin = best * threshold if n == 'rel' else threshold
+            return current < best - margin
+        # Everything else is compared as "max"; only max/rel scales by best.
+        margin = best * threshold if m == 'max' and n == 'rel' else threshold
+        return current > best + margin
+
+    BEST, LR, COOLDOWN, BAD = range(4)
+    if var_list[COOLDOWN] > 0:
+        # Still cooling down: burn one epoch and keep the lr untouched.
+        var_list[COOLDOWN] -= 1
+    elif is_better(loss, var_list[BEST], m, n):
+        var_list[BEST], var_list[BAD] = loss, 0
+    else:
+        var_list[BAD] += 1
+        if var_list[BAD] > patience:
+            var_list[COOLDOWN], var_list[BAD] = cooldown, 0
+            new_lr = var_list[LR] * decay_rate
+            if var_list[LR] - new_lr > 1e-8:
+                var_list[LR] = new_lr
+    return var_list[LR]
+
+
+class TestReduceLROnPlateauDecay(object):  # NOTE(review): derives from object, not unittest.TestCase, so unittest.main() should not collect it — confirm this is intentional
+    def test_ReduceLR(self):
+        # the decay rate must be less than 1.0
+        with self.assertRaises(ValueError):
+            paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=2.0)
+        # the mode must be "min" or "max"
+        with self.assertRaises(ValueError):
+            paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, mode="test")
+        # the threshold_mode must be "rel" or "abs"
+        with self.assertRaises(ValueError):
+            paddle.optimizer.ReduceLROnPlateau(
+                learning_rate=1.0, threshold_mode="test")
+        with self.assertRaises(TypeError):  # a string learning_rate is expected to be rejected
+            paddle.optimizer.ReduceLROnPlateau(learning_rate="test")
+        with self.assertRaises(TypeError):  # a string metric passed to step() is expected to be rejected
+            paddle.optimizer.ReduceLROnPlateau(learning_rate=0.5).step("test")
+
+        places = [paddle.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(paddle.CUDAPlace(0))
+
+        for place in places:
+            for m, n in zip(['min', 'max', 'min', 'max'],  # the four (mode, threshold_mode) combinations
+                            ['rel', 'rel', 'abs', 'abs']):
+                kwargs = {
+                    'learning_rate': 1.0,
+                    'mode': m,
+                    'factor': 0.5,
+                    'patience': 3,
+                    'threshold': 1e-4,
+                    'threshold_mode': n,
+                    'cooldown': 1,
+                    'min_lr': 0,
+                    'epsilon': 1e-8,
+                    'verbose': False,
+                }
+                paddle.enable_static()
+                self._test_static(place, kwargs)
+                paddle.disable_static(place)
+                self._test_dygraph(place, kwargs)
+                paddle.enable_static()  # leave static mode on after each combination
+
+    def _test_static(self, place, kwargs):  # compares the scheduler against reduce_lr_on_plateau in static mode
+        paddle.enable_static()
+
+        best = float("-10000") if kwargs['mode'] == "max" else float("10000")
+        current_lr = 1.0
+        cooldown_counter = 0
+        num_bad_epochs = 0
+        var_list = [best, current_lr, cooldown_counter, num_bad_epochs]  # mutated in place by reduce_lr_on_plateau
+
+        main_prog = paddle.static.Program()
+        start_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog, start_prog):
+            x = fluid.layers.create_global_var(
+                [1], 1, 'float32', persistable=True)
+            paddle.increment(x)
+            loss = paddle.sin(x)  # sin of the incremented counter serves as the monitored metric
+            scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs)
+            adam = paddle.optimizer.Adam(learning_rate=scheduler)
+            adam.minimize(loss)
+            lr_var = adam._global_learning_rate()
+            test_prog = main_prog.clone()
+
+        exe = paddle.static.Executor(place)
+        exe.run(start_prog)
+
+        for epoch in range(20):  # phase 1: run main_prog
+            for batch_id in range(1):
+                out, actual_lr = exe.run(main_prog,
+                                         fetch_list=[loss.name, lr_var.name])
+                expected_lr = reduce_lr_on_plateau(
+                    kwargs['factor'], kwargs['threshold'], kwargs['cooldown'],
+                    kwargs['patience'], kwargs['mode'],
+                    kwargs['threshold_mode'], out[0], var_list)
+
+            scheduler.step(out[0])
+            actual_lr = scheduler()  # replaces the lr fetched from the executor above
+            self.assertEqual(actual_lr, np.array(expected_lr))
+
+        for epoch in range(10):  # phase 2: same check against the cloned program
+            for batch_id in range(1):
+                out, actual_lr = exe.run(test_prog,
+                                         fetch_list=[loss.name, lr_var.name])
+                expected_lr = reduce_lr_on_plateau(
+                    kwargs['factor'], kwargs['threshold'], kwargs['cooldown'],
+                    kwargs['patience'], kwargs['mode'],
+                    kwargs['threshold_mode'], out[0], var_list)
+            scheduler.step(out[0])
+            actual_lr = scheduler()  # replaces the lr fetched from the executor above
+            self.assertEqual(actual_lr, np.array(expected_lr))
+
+    def _test_dygraph(self, place, kwargs):  # same comparison in dygraph mode, plus a state_dict round trip
+        paddle.disable_static(place)
+
+        best = float("-10000") if kwargs['mode'] == "max" else float("10000")
+        current_lr = 1.0
+        cooldown_counter = 0
+        num_bad_epochs = 0
+        var_list = [best, current_lr, cooldown_counter, num_bad_epochs]  # mutated in place by reduce_lr_on_plateau
+
+        linear = paddle.nn.Linear(10, 10)
+        scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs)
+        adam = paddle.optimizer.Adam(
+            learning_rate=scheduler, parameters=linear.parameters())
+
+        for epoch in range(20):
+            for batch_id in range(1):
+                x = paddle.to_tensor(epoch).astype('float32')
+                loss = paddle.sin(x)  # loss depends only on the epoch number, not on `linear`
+                loss.backward()
+                adam.step()
+                adam.clear_grad()
+
+            scheduler.step(loss)
+            # get lr from paddle
+            current_lr = adam.get_lr()
+            # get lr from python
+            expected_lr = reduce_lr_on_plateau(
+                kwargs['factor'], kwargs['threshold'], kwargs['cooldown'],
+                kwargs['patience'], kwargs['mode'], kwargs['threshold_mode'],
+                loss, var_list)
+            self.assertEqual(current_lr, expected_lr)
+        state_dict = adam.state_dict()  # round trip: a fresh scheduler must pick up the saved state
+        scheduler1 = paddle.optimizer.ReduceLROnPlateau(**kwargs)
+        adam1 = paddle.optimizer.Adam(
+            learning_rate=scheduler1, parameters=linear.parameters())
+        adam1.set_state_dict(state_dict)
+        self.assertEqual(scheduler.cooldown_counter,
+                         scheduler1.cooldown_counter)
+        self.assertEqual(scheduler.best.numpy()[0], scheduler1.best)
+        self.assertEqual(scheduler.num_bad_epochs, scheduler1.num_bad_epochs)
+        self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch)
+        self.assertEqual(scheduler.last_lr, scheduler1.last_lr)
+
+
+def noam_lr(epoch_num, d_model, warmup_steps, learning_rate=1.0, verbose=False):
+    """Reference Noam schedule: lr * d_model^-0.5 * min(decay, warmup)."""
+    # Epoch 0 has no inverse square root, so the decay term is pinned to 1.
+    decay = 1 if epoch_num == 0 else math.pow(epoch_num, -0.5)
+    warmup = math.pow(warmup_steps, -1.5) * epoch_num
+    scale = learning_rate * math.pow(d_model, -0.5)
+    return scale * min(decay, warmup)
+
+
+def lambda_lr(epoch_num, learning_rate, lr_lambda, verbose=False):  # verbose is accepted but unused
+    return learning_rate * lr_lambda(epoch_num)  # lr_lambda maps the epoch to a multiplicative factor
+
+
+def piecewise_lr(epoch_num, boundaries, values, verbose=False):
+    """Reference piecewise lr: values[i] for the first boundaries[i] > epoch_num."""
+    assert len(boundaries) + 1 == len(values)
+    # The first boundary above epoch_num selects the value; otherwise use the last.
+    hits = (v for b, v in zip(boundaries, values) if epoch_num < b)
+    return next(hits, values[-1])
+
+
+def exponential_lr(epoch_num, learning_rate, gamma, verbose=False):  # verbose is accepted but unused
+    return learning_rate * gamma**epoch_num  # learning_rate * gamma^epoch_num
+
+
+def natural_exp_lr(epoch_num, learning_rate, gamma, verbose=False):  # verbose is accepted but unused
+    return learning_rate * math.exp(-1 * gamma * epoch_num)  # learning_rate * e^(-gamma * epoch_num)
+
+
+def inverse_time_lr(epoch_num, learning_rate, gamma, verbose=False):  # verbose is accepted but unused
+    return learning_rate / (1 + gamma * epoch_num)  # learning_rate / (1 + gamma * epoch_num)
+
+
+def polynomial_lr(epoch_num,
+                  learning_rate,
+                  decay_steps,
+                  end_lr=0.0001,
+                  power=1.0,
+                  cycle=False,
+                  verbose=False):
+    """Reference polynomial decay from learning_rate towards end_lr.
+
+    Args:
+        epoch_num: current epoch index.
+        learning_rate: initial learning rate.
+        decay_steps: number of epochs the decay is spread over.
+        end_lr: value the rate decays towards.
+        power: exponent applied to the remaining fraction.
+        cycle: if True, decay_steps is stretched to the smallest multiple of
+            itself that is >= epoch_num; otherwise epoch_num is clamped.
+        verbose: unused; accepted so scheduler kwargs can be passed through.
+    """
+    if cycle:
+        # Epoch 0 would give a multiplier of 0 (and a zero divisor), so use 1.
+        div = math.ceil(epoch_num / float(decay_steps)) if epoch_num != 0 else 1
+        decay_steps = decay_steps * div
+    else:
+        # Clamp so the rate stays at end_lr (for power > 0) after decay_steps.
+        epoch_num = min(epoch_num, decay_steps)
+    remaining = 1 - float(epoch_num) / float(decay_steps)
+    return (learning_rate - end_lr) * (remaining**power) + end_lr
+
+
+cosine_annealing_lr_current = None  # running lr shared across cosine_annealing_lr calls; assigned on its epoch-0 call
+
+
+def cosine_annealing_lr(epoch_num,
+                        learning_rate,
+                        T_max,
+                        eta_min=0,
+                        verbose=False):
+    """Recursive cosine-annealing reference; state lives in a module global."""
+    global cosine_annealing_lr_current
+    if epoch_num == 0:
+        cosine_annealing_lr_current = learning_rate
+    elif (epoch_num - 1 - T_max) % (2 * T_max) == 0:
+        swing = (learning_rate - eta_min) * (1 - math.cos(math.pi / float(T_max)))
+        cosine_annealing_lr_current = cosine_annealing_lr_current + swing / 2
+    else:
+        numer = 1 + math.cos(math.pi * epoch_num / float(T_max))
+        denom = 1 + math.cos(math.pi * (epoch_num - 1) / float(T_max))
+        excess = cosine_annealing_lr_current - eta_min
+        cosine_annealing_lr_current = numer / denom * excess + eta_min
+    return cosine_annealing_lr_current
+
+
+def linear_warmup_lr(epoch_num,
+                     learning_rate,
+                     warmup_steps,
+                     start_lr,
+                     end_lr,
+                     verbose=False):
+    """Ramp linearly from start_lr to end_lr, then hold learning_rate."""
+    if not epoch_num < warmup_steps:
+        return learning_rate  # warm-up finished
+    progress = float(epoch_num) / float(warmup_steps)
+    return start_lr + (end_lr - start_lr) * progress
+
+
+def multi_step_lr(epoch_num,
+                  learning_rate,
+                  milestones,
+                  gamma=0.1,
+                  verbose=False):
+    """learning_rate * gamma**k, k = index of the first milestone > epoch_num."""
+    passed = next((i for i, m in enumerate(milestones) if epoch_num < m),
+                  len(milestones))
+    return learning_rate * (gamma**passed)
+
+
+def step_lr(epoch_num, learning_rate, step_size, gamma=0.1, verbose=False):  # verbose is accepted but unused
+    return learning_rate * math.pow(gamma, epoch_num // step_size)  # one factor of gamma per completed block of step_size epochs
+
+
+class TestLRScheduler(unittest.TestCase):  # checks each paddle scheduler against its pure-Python reference function
+    def _test_static(self, python_func, paddle_api, kwarg, place):  # NOTE(review): not called anywhere in this file as visible here (call is commented out in test_scheduler)
+        main_prog = paddle.static.Program()
+        start_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog, start_prog):
+            x = paddle.static.data(name='x', shape=[3, 4, 5])
+            y = paddle.static.data(name='y', shape=[3, 4, 5])  # NOTE(review): y is declared and fed but never used by the loss
+            z = paddle.static.nn.fc(x, 100)
+            loss = paddle.mean(z)
+            scheduler = paddle_api(**kwarg)
+            adam = paddle.optimizer.Adam(learning_rate=scheduler)
+            adam.minimize(loss)
+            lr_var = adam._global_learning_rate()
+            test_prog = main_prog.clone()
+
+        num = 0  # epoch counter passed to python_func; keeps increasing across all phases below
+        exe = paddle.static.Executor(place)
+        exe.run(start_prog)
+        for epoch in range(5):  # phase 1: plain executor on main_prog
+            for batch_id in range(2):
+                out = exe.run(
+                    main_prog,
+                    feed={
+                        'x': np.random.randn(3, 4, 5).astype('float32'),
+                        'y': np.random.randn(3, 4, 5).astype('float32')
+                    },
+                    fetch_list=lr_var.name)
+            self.assertEqual(out, np.array(python_func(num, **kwarg)))
+            scheduler.step()
+            num += 1
+
+        for epoch in range(5):  # phase 2: plain executor on the cloned program
+            for batch_id in range(2):
+                out = exe.run(
+                    test_prog,
+                    feed={
+                        'x': np.random.randn(3, 4, 5).astype('float32'),
+                        'y': np.random.randn(3, 4, 5).astype('float32')
+                    },
+                    fetch_list=lr_var.name)
+            self.assertEqual(out, np.array(python_func(num, **kwarg)))
+            scheduler.step()
+            num += 1
+
+        if isinstance(place, paddle.CPUPlace):  # data-parallel phases run only for CPU places, using 4 CPU places
+            compiled_train_prog = paddle.static.CompiledProgram(
+                main_prog).with_data_parallel(
+                    loss_name=loss.name, places=fluid.cpu_places(4))
+            for epoch in range(5):  # phase 3: data-parallel main_prog
+                python_result = python_func(num, **kwarg)
+                for batch_id in range(2):
+                    _ = exe.run(
+                        compiled_train_prog,
+                        feed={
+                            'x': np.random.randn(12, 4, 5).astype('float32'),  # 12 = 3 x 4 — presumably one 3-row shard per place; TODO confirm
+                            'y': np.random.randn(12, 4, 5).astype('float32')
+                        },
+                        fetch_list=lr_var.name)
+                scopes = compiled_train_prog._executor.local_scopes()
+                out = np.array(scopes[0].var(lr_var.name).get_tensor())  # the lr is read from each of the 4 local scopes in turn
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[1].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[2].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[3].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                scheduler.step()
+                num += 1
+
+            compiled_test_prog = paddle.static.CompiledProgram(
+                test_prog).with_data_parallel(
+                    loss_name=loss.name,
+                    share_vars_from=compiled_train_prog,
+                    places=fluid.cpu_places(4))
+            for epoch in range(5):  # phase 4: data-parallel cloned program sharing variables with phase 3
+                python_result = python_func(num, **kwarg)
+                for batch_id in range(2):
+                    _ = exe.run(
+                        compiled_test_prog,
+                        feed={
+                            'x': np.random.randn(12, 4, 5).astype('float32'),
+                            'y': np.random.randn(12, 4, 5).astype('float32')
+                        },
+                        fetch_list=lr_var.name)
+                scopes = compiled_test_prog._executor.local_scopes()
+                out = np.array(scopes[0].var(lr_var.name).get_tensor())  # the lr is read from each of the 4 local scopes in turn
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[1].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[2].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[3].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                scheduler.step()
+                num += 1
+
+    def _test_dygraph(self, python_func, paddle_api, kwarg, place):  # compares adam.get_lr() with python_func for 20 epochs
+        paddle.disable_static(place)
+        x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+        linear = paddle.nn.Linear(10, 10)
+        scheduler = paddle_api(**kwarg)
+        adam = paddle.optimizer.Adam(
+            learning_rate=scheduler, parameters=linear.parameters())
+        for epoch in range(20):
+            for batch_id in range(2):
+                x = paddle.to_tensor(x)  # rebinds x; from the second pass on the input is already a tensor
+                out = linear(x)
+                loss = paddle.reduce_mean(out)
+                loss.backward()
+                adam.step()
+                adam.clear_grad()
+            current_lr = adam.get_lr()
+            expected_lr = python_func(epoch, **kwarg)
+            if paddle_api.__name__ != "CosineAnnealingLR":
+                self.assertEqual(current_lr, expected_lr)  # exact match expected for every other scheduler
+                scheduler.step()
+            else:
+                self.assertAlmostEqual(current_lr, expected_lr)  # cosine annealing is compared approximately
+                scheduler.step(epoch + 1)  # and stepped with an explicit epoch number
+
+    def test_scheduler(self):
+        with self.assertRaises(NotImplementedError):  # the base class step() is expected to be unusable directly
+            paddle.optimizer.lr_scheduler._LRScheduler().step()
+        with self.assertRaises(TypeError):
+            paddle.optimizer.MultiStepLR(
+                learning_rate="test", milestones=[1, 2, 3])
+        with self.assertRaises(TypeError):
+            paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones='test')
+        with self.assertRaises(ValueError):  # milestones given in decreasing order
+            paddle.optimizer.MultiStepLR(
+                learning_rate=0.5, milestones=[3, 2, 1])
+        with self.assertRaises(ValueError):  # gamma of 2 is expected to be rejected
+            paddle.optimizer.MultiStepLR(
+                learning_rate=0.5, milestones=[1, 2, 3], gamma=2)
+
+        func_api_kwargs = [(noam_lr, paddle.optimizer.NoamLR, {
+            "d_model": 0.01,
+            "warmup_steps": 100,
+            "verbose": False
+        }), (piecewise_lr, paddle.optimizer.PiecewiseLR, {
+            "boundaries": [3, 6, 9, 15, 20],
+            "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+            "verbose": False
+        }), (natural_exp_lr, paddle.optimizer.NaturalExpLR, {
+            "learning_rate": 0.5,
+            "gamma": 0.1,
+            "verbose": True
+        }), (inverse_time_lr, paddle.optimizer.InverseTimeLR, {
+            "learning_rate": 0.5,
+            "gamma": 0.1,
+            "verbose": False
+        }), (polynomial_lr, paddle.optimizer.PolynomialLR, {
+            "learning_rate": 0.5,
+            "decay_steps": 20,
+            "end_lr": 0,
+            "power": 1.0,
+            "cycle": False,
+            "verbose": True
+        }), (polynomial_lr, paddle.optimizer.PolynomialLR, {
+            "learning_rate": 0.5,
+            "decay_steps": 20,
+            "end_lr": 0,
+            "power": 1.0,
+            "cycle": True,
+            "verbose": False
+        }), (linear_warmup_lr, paddle.optimizer.LinearLrWarmup, {
+            'learning_rate': 0.5,
+            'warmup_steps': 20,
+            'start_lr': 0,
+            'end_lr': 0.5,
+            "verbose": True
+        }), (exponential_lr, paddle.optimizer.ExponentialLR, {
+            "learning_rate": 0.5,
+            "gamma": 0.9,
+            "verbose": False
+        }), (multi_step_lr, paddle.optimizer.MultiStepLR, {
+            "learning_rate": 0.5,
+            "milestones": [3, 6, 9, 15, 20],
+            "gamma": 0.8,
+            "verbose": True
+        }), (step_lr, paddle.optimizer.StepLR, {
+            "learning_rate": 0.5,
+            "step_size": 2,
+            "gamma": 0.8,
+            "verbose": False
+        }), (lambda_lr, paddle.optimizer.LambdaLR, {
+            "learning_rate": 0.5,
+            "lr_lambda": lambda x: 0.95**x,
+            "verbose": True
+        }), (cosine_annealing_lr, paddle.optimizer.CosineAnnealingLR, {
+            "learning_rate": 0.5,
+            "T_max": 10,
+            "verbose": False
+        })]
+
+        for python_func, paddle_api, kwarg in func_api_kwargs:  # the same kwarg dict feeds both the paddle API and the reference
+            places = [paddle.CPUPlace()]
+            if core.is_compiled_with_cuda():
+                places.append(paddle.CUDAPlace(0))
+
+            for place in places:
+                paddle.enable_static()
+                #self._test_static(python_func, paddle_api, kwarg, place)  # NOTE(review): static-graph check is disabled, so only dygraph is exercised — confirm intent
+                paddle.disable_static(place)
+                self._test_dygraph(python_func, paddle_api, kwarg, place)
+                paddle.enable_static()  # leave static mode on for whatever runs next
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_manual_seed.py b/python/paddle/fluid/tests/unittests/test_manual_seed.py
index 747026622e4653491feecb852dde67a19b8ff666..a1d6eb915ce78a2b19092f20f8cfeed0d60979b2 100644
--- a/python/paddle/fluid/tests/unittests/test_manual_seed.py
+++ b/python/paddle/fluid/tests/unittests/test_manual_seed.py
@@ -15,30 +15,33 @@
 from __future__ import print_function
 import unittest
 
+import paddle
 import paddle.fluid as fluid
 from paddle.framework import manual_seed
 from paddle.fluid.framework import Program, default_main_program, default_startup_program
+import numpy as np
 
 
 class TestManualSeed(unittest.TestCase):
     def test_manual_seed(self):
-        local_program = Program()
-        local_main_prog = default_main_program()
-        local_start_prog = default_startup_program()
-
-        self.assertEqual(0, local_program.random_seed)
-        self.assertEqual(0, local_main_prog.random_seed)
-        self.assertEqual(0, local_start_prog.random_seed)
-
-        manual_seed(102)
-        global_program1 = Program()
-        global_program2 = Program()
-        global_main_prog = default_main_program()
-        global_start_prog = default_startup_program()
-        self.assertEqual(102, global_program1.random_seed)
-        self.assertEqual(102, global_program2.random_seed)
-        self.assertEqual(102, global_main_prog.random_seed)
-        self.assertEqual(102, global_start_prog.random_seed)
+        fluid.enable_dygraph()  # NOTE(review): dygraph mode is not switched off again in this test
+
+        gen = paddle.manual_seed(12312321111)  # the return value is used as a generator handle below
+        x = fluid.layers.gaussian_random([10], dtype="float32")  # first draw after seeding
+        st1 = gen.get_state()  # snapshot taken after x and before x1
+        x1 = fluid.layers.gaussian_random([10], dtype="float32")
+        gen.set_state(st1)  # rewind to the snapshot: x2 is expected to repeat x1
+        x2 = fluid.layers.gaussian_random([10], dtype="float32")
+        gen.manual_seed(12312321111)  # reseed with the same value: x3 is expected to repeat x
+        x3 = fluid.layers.gaussian_random([10], dtype="float32")
+        x_np = x.numpy()
+        x1_np = x1.numpy()
+        x2_np = x2.numpy()
+        x3_np = x3.numpy()
+
+        if not fluid.core.is_compiled_with_cuda():  # equality is asserted only on builds without CUDA
+            self.assertTrue(np.allclose(x1_np, x2_np))
+            self.assertTrue(np.allclose(x_np, x3_np))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py
index 77ec6f9b6bcda7568325698634fd4f86557cd1be..a535ef5e60397718e97100332b945b360838bbf4 100644
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -19,6 +19,8 @@ import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
 
 
 class TestMomentumOp1(OpTest):
@@ -234,5 +236,48 @@ class TestSparseMomentumOp2(TestSparseMomentumOp):
         self.use_nesterov = True
 
 
+class TestMomentumV2(unittest.TestCase):
+    """Smoke tests for paddle.optimizer.Momentum in dygraph and static mode."""
+    def test_momentum_dygraph(self):
+        paddle.disable_static()
+        a = paddle.to_tensor(np.arange(26).reshape(2, 13).astype("float32"))
+        linear = paddle.nn.Linear(13, 5)
+        momentum_opt = paddle.optimizer.Momentum(
+            learning_rate=0.01, momentum=0.9, parameters=linear.parameters())
+        out = linear(a)
+        out.backward()
+        momentum_opt.step()
+        momentum_opt.clear_gradients()
+        paddle.enable_static()  # do not leak dygraph mode into later tests
+
+    def test_momentum(self):  # static graph: one pass of linear regression over uci_housing
+        place = fluid.CPUPlace()
+        main = fluid.Program()
+        with fluid.program_guard(main):
+            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+            y_predict = fluid.layers.fc(input=x, size=1, act=None)
+            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = fluid.layers.mean(cost)
+
+            momentum_optimizer = paddle.optimizer.Momentum(
+                learning_rate=0.1, momentum=0.9)
+            momentum_optimizer.minimize(avg_cost)
+
+            fetch_list = [avg_cost]
+            train_reader = paddle.batch(
+                paddle.dataset.uci_housing.train(), batch_size=1)
+            feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            for data in train_reader():  # only checks that every step runs; results are not asserted
+                exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
+
+    def test_raise_error(self):  # None for learning_rate or momentum must raise ValueError
+        self.assertRaises(
+            ValueError, paddle.optimizer.Momentum, learning_rate=None)
+        self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e2f9562b453b7faf40d4fc421dcea4967724025
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import unittest
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.io import TensorDataset, DataLoader
+from paddle.fluid.dygraph.base import to_variable
+
+
+class TestTensorDataset(unittest.TestCase):
+    def run_main(self, num_workers, places):
+        fluid.default_startup_program().random_seed = 1
+        fluid.default_main_program().random_seed = 1
+        place = fluid.CPUPlace()
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([16, 3, 4]).astype('float32')
+            input = to_variable(input_np)
+            label_np = np.random.random([16, 1]).astype('int32')
+            label = to_variable(label_np)
+
+            dataset = TensorDataset([input, label])
+            assert len(dataset) == 16
+            dataloader = DataLoader(
+                dataset, places=place, num_workers=num_workers,
+                batch_size=1, drop_last=True)
+
+            for i, (input, label) in enumerate(dataloader()):
+                assert len(input) == 1
+                assert len(label) == 1
+                assert input.shape == [1, 3, 4]
+                assert label.shape == [1, 1]
+                assert isinstance(input, paddle.Tensor)
+                assert isinstance(label, paddle.Tensor)
+                assert np.allclose(input.numpy(), input_np[i])
+                assert np.allclose(label.numpy(), label_np[i])
+
+    def test_main(self):
+        # Only request CUDAPlace when this build has CUDA support.
+        places = [fluid.CPUPlace()]
+        if fluid.core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            for num_workers in [0, 2]:
+                self.run_main(num_workers=num_workers, places=p)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..19da09a463f3cc6224a22eb90278abae9ec59b91
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py
@@ -0,0 +1,556 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+import paddle.nn as nn
+import paddle
+
+
+def nearest_neighbor_interp_np(X,
+                               out_h,
+                               out_w,
+                               out_size=None,
+                               actual_shape=None,
+                               align_corners=True,
+                               data_layout='NCHW'):
+    """NumPy reference for nearest-neighbor resizing of a 4-D array.
+
+    X is read as NCHW, or as NHWC when data_layout == "NHWC"; the result
+    keeps X's layout and dtype. The target size is (out_h, out_w) unless
+    out_size is given, and actual_shape, when given, overrides both.
+
+    The source index of output position i is int(ratio * i + 0.5) with
+    ratio = (in - 1) / (out - 1) when align_corners is set, otherwise
+    int(ratio * i) with ratio = in / out. ratio is 0 when out <= 1.
+    """
+    channels_last = data_layout == "NHWC"
+    if channels_last:
+        X = np.transpose(X, (0, 3, 1, 2))  # NHWC => NCHW
+    for size in (out_size, actual_shape):  # the later entry wins
+        if size is not None:
+            out_h, out_w = size[0], size[1]
+    n, c, in_h, in_w = X.shape
+
+    def source_indices(in_len, out_len):
+        # ratio maps an output index back onto the input axis.
+        ratio = 0.0
+        if out_len > 1:
+            if align_corners:
+                ratio = (in_len - 1.0) / (out_len - 1.0)
+            else:
+                ratio = 1.0 * in_len / out_len
+        if align_corners:
+            return [int(ratio * k + 0.5) for k in range(out_len)]
+        return [int(ratio * k) for k in range(out_len)]
+
+    rows = source_indices(in_h, out_h)
+    cols = source_indices(in_w, out_w)
+    out = np.zeros((n, c, out_h, out_w))
+    # Copy one output position at a time across every batch and channel.
+    for i, src_i in enumerate(rows):
+        for j, src_j in enumerate(cols):
+            out[:, :, i, j] = X[:, :, src_i, src_j]
+
+    if channels_last:
+        out = np.transpose(out, (0, 2, 3, 1))  # NCHW => NHWC
+
+    return out.astype(X.dtype)
+
+
+class TestNearestInterpOp(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCHW'
+        self.init_test_case()
+        self.op_type = "nearest_interp_v2"
+        input_np = np.random.random(self.input_shape).astype("float64")
+
+        if self.data_layout == "NCHW":
+            in_h = self.input_shape[2]
+            in_w = self.input_shape[3]
+        else:
+            in_h = self.input_shape[1]
+            in_w = self.input_shape[2]
+
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(in_h * scale_h)
+            out_w = int(in_w * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = nearest_neighbor_interp_np(
+            input_np, out_h, out_w, self.out_size, self.actual_shape,
+            self.align_corners, self.data_layout)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        if self.actual_shape is not None:
+            self.inputs['OutSize'] = self.actual_shape
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'data_layout': self.data_layout
+        }
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 3, 4, 5]
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase1(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase2(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase3(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase4(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.out_size = np.array([2, 2]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase5(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.out_size = np.array([11, 11]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase6(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([65, 129]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpSame(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h = 32
+        self.out_w = 64
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpDataLayout(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 4, 4, 5]
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 8]).astype("int32")
+        self.align_corners = True
+        self.data_layout = "NHWC"
+
+
+class TestNearestInterpOpUint8(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "nearest_interp_v2"
+        input_np = np.random.randint(
+            low=0, high=256, size=self.input_shape).astype("uint8")
+
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(self.input_shape[2] * scale_h)
+            out_w = int(self.input_shape[3] * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = nearest_neighbor_interp_np(input_np, out_h, out_w,
+                                               self.out_size, self.actual_shape,
+                                               self.align_corners)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners
+        }
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output_with_place(place=core.CPUPlace(), atol=1)
+
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 3, 9, 6]
+        self.out_h = 10
+        self.out_w = 9
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h = 80
+        self.out_w = 40
+        self.scale = 0.
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 5
+        self.out_w = 13
+        self.scale = 0.
+        self.out_size = np.array([6, 15]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestInterpWithoutCorners(TestNearestInterpOp):
+    def set_align_corners(self):  # NOTE(review): unused by the inherited setUp
+        self.align_corners = False
+
+
+class TestNearestNeighborInterpScale1(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 7, 5]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 2.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpScale2(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 5, 7]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 1.5
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpScale3(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 7, 5]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = [2.0, 3.0]
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestInterpOp_attr_tensor(OpTest):  # sizes/scales fed as tensors
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "nearest_interp_v2"
+        self.shape_by_1Dtensor = False  # NOTE(review): clobbers init_test_case
+        self.scale_by_1Dtensor = False  # NOTE(review): clobbers init_test_case
+        self.attrs = {
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+        }
+
+        input_np = np.random.random(self.input_shape).astype("float64")
+        self.inputs = {'X': input_np}
+
+        if self.scale_by_1Dtensor:  # scale is fed as the Scale input
+            self.inputs['Scale'] = np.array([self.scale]).astype("float64")
+        elif self.scale:  # expected size derived from the scale attribute
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(self.input_shape[2] * scale_h)
+            out_w = int(self.input_shape[3] * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        if self.shape_by_1Dtensor:  # whole size as one OutSize input
+            self.inputs['OutSize'] = self.out_size
+        elif self.out_size is not None:  # one named int32 tensor per dimension
+            size_tensor = []
+            for index, ele in enumerate(self.out_size):
+                size_tensor.append(("x" + str(index), np.ones(
+                    (1)).astype('int32') * ele))
+            self.inputs['SizeTensor'] = size_tensor
+
+        self.attrs['out_h'] = self.out_h
+        self.attrs['out_w'] = self.out_w
+        if self.scale:  # scalar / 1-element scale -> [s, s]
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        output_np = nearest_neighbor_interp_np(input_np, out_h, out_w,
+                                               self.out_size, self.actual_shape,
+                                               self.align_corners)
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):  # default case; subclasses override this hook
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 5, 4, 4]
+        self.out_h = 3
+        self.out_w = 3
+        self.scale = 0.
+        self.out_size = [3, 3]
+        self.align_corners = True
+
+
+# out_size is a tensor list
+class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.out_size = [8, 12]
+        self.align_corners = True
+
+
+# out_size is a 1-D tensor
+class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+        self.shape_by_1Dtensor = True
+
+
+# scale is a 1-D tensor
+class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 2.0
+        self.out_size = None
+        self.align_corners = True
+        self.scale_by_1Dtensor = True
+
+
+class TestNearestAPI(unittest.TestCase):
+    def test_case(self):
+        """All resize_nearest size/scale spellings must agree with NumPy."""
+        x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+        y = fluid.data(name="y", shape=[2, 6, 6, 3], dtype="float32")
+
+        dim = fluid.data(name="dim", shape=[1], dtype="int32")
+        shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32")
+        actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32")
+        scale_tensor = fluid.data(
+            name="scale_tensor", shape=[1], dtype="float32")
+
+        # NHWC input with a static shape first, then four NCHW variants.
+        outputs = [
+            fluid.layers.resize_nearest(
+                y, out_shape=[12, 12], data_format='NHWC'),
+            fluid.layers.resize_nearest(x, out_shape=[12, dim]),
+            fluid.layers.resize_nearest(x, out_shape=shape_tensor),
+            fluid.layers.resize_nearest(
+                x, out_shape=[4, 4], actual_shape=actual_size),
+            fluid.layers.resize_nearest(x, scale=scale_tensor),
+        ]
+
+        x_data = np.random.random((2, 3, 6, 6)).astype("float32")
+        feed = {
+            "x": x_data,
+            "y": np.transpose(x_data, (0, 2, 3, 1)),
+            "dim": np.array([12]).astype("int32"),
+            "shape_tensor": np.array([12, 12]).astype("int32"),
+            "actual_size": np.array([12, 12]).astype("int32"),
+            "scale_tensor": np.array([2.0]).astype("float32"),
+        }
+
+        use_cuda = core.is_compiled_with_cuda()
+        place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        results = exe.run(fluid.default_main_program(),
+                          feed=feed,
+                          fetch_list=outputs,
+                          return_numpy=True)
+
+        expect_res = nearest_neighbor_interp_np(
+            x_data, out_h=12, out_w=12, align_corners=True)
+        nhwc_res, nchw_results = results[0], results[1:]
+        self.assertTrue(
+            np.allclose(nhwc_res, np.transpose(expect_res, (0, 2, 3, 1))))
+        for res in nchw_results:
+            self.assertTrue(np.allclose(res, expect_res))
+
+
+class TestUpsampleNearest2dInterpOpAPI2_0(unittest.TestCase):
+    def test_case(self):
+        """Dygraph UpsamplingNearest2d (x2) must match the NumPy reference."""
+        image = np.random.random((1, 3, 6, 6)).astype("float32")
+        reference = nearest_neighbor_interp_np(
+            image, out_h=12, out_w=12, align_corners=False)
+
+        # dygraph
+        layer = paddle.nn.UpsamplingNearest2d(scale_factor=[2, 2])
+        with fluid.dygraph.guard():
+            resized = layer(fluid.dygraph.to_variable(image))
+            self.assertTrue(np.allclose(resized.numpy(), reference))
+
+
+class TestNearestInterpException(unittest.TestCase):
+    def test_exception(self):
+        """resize_nearest must raise on each invalid argument below."""
+        # One 4-D placeholder is shared by all three checks.
+        input = fluid.data(name="input", shape=[1, 3, 6, 6], dtype="float32")
+
+        # for 4-D input, data_format can only be NCHW or NHWC
+        with self.assertRaises(ValueError):
+            fluid.layers.resize_nearest(
+                input, out_shape=[4, 8], data_format='NDHWC')
+
+        # a string is not an accepted scale type
+        with self.assertRaises(TypeError):
+            fluid.layers.resize_nearest(input, scale='scale')
+
+        # a negative scale value is rejected
+        with self.assertRaises(ValueError):
+            fluid.layers.resize_nearest(input, scale=-0.3)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py
index 0d083038c6131215dcb572eca1e782c43e82d20a..c047cf6ddff78641b918de75a284574175bb3bca 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_all.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_all.py
@@ -22,9 +22,40 @@ import paddle.fluid as fluid
 
 
 def p_norm(x, axis, porder, keepdims=False):
-    if axis is None: axis = -1
-    r = np.linalg.norm(
-        x, ord=porder, axis=axis, keepdims=keepdims).astype(x.dtype)
+    """NumPy reference for paddle.norm over `axis` with order `porder`.
+
+    axis=None reduces over all elements; a two-element list/tuple reduces
+    element-wise over both axes; anything else defers to np.linalg.norm.
+    """
+    if axis is None:
+        x = x.flatten()
+        if porder == np.inf:
+            r = np.amax(np.abs(x))
+        elif porder == -np.inf:
+            r = np.amin(np.abs(x))
+        else:
+            r = np.linalg.norm(x, ord=porder)
+    elif isinstance(axis, (list, tuple)) and len(axis) == 2:
+        # NumPy reductions over several axes need a tuple, not a list.
+        axis = tuple(axis)
+        if porder == np.inf:
+            r = np.amax(np.abs(x), axis=axis, keepdims=keepdims)
+        elif porder == -np.inf:
+            r = np.amin(np.abs(x), axis=axis, keepdims=keepdims)
+        elif porder == 0:
+            r = np.sum(x.astype(bool), axis=axis, keepdims=keepdims)
+        elif porder == 1:
+            r = np.sum(np.abs(x), axis=axis, keepdims=keepdims)
+        else:
+            xp = np.power(np.abs(x), porder)
+            s = np.sum(xp, axis=axis, keepdims=keepdims)
+            r = np.power(s, 1.0 / porder)
+    else:
+        if isinstance(axis, list):
+            axis = tuple(axis)
+        r = np.linalg.norm(
+            x, ord=porder, axis=axis, keepdims=keepdims).astype(x.dtype)
+
     return r
 
 
@@ -186,22 +217,10 @@ class TestPnormOp5(TestPnormOp):
         self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
 
 
-def run_out(self, p, axis, shape_x, shape_y, dtype):
-    with fluid.program_guard(fluid.Program()):
-        data1 = fluid.data(name="X", shape=shape_x, dtype=dtype)
-        data2 = fluid.data(name="Y", shape=shape_y, dtype=dtype)
-        out = paddle.norm(input=data1, p=p, axis=axis, out=data2)
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        result = exe.run(feed={"X": np.random.rand(*shape_x).astype(dtype)},
-                         fetch_list=[data2, out])
-        self.assertEqual((result[0] == result[1]).all(), True)
-
-
 def run_fro(self, p, axis, shape_x, dtype):
     with fluid.program_guard(fluid.Program()):
         data = fluid.data(name="X", shape=shape_x, dtype=dtype)
-        out = paddle.norm(input=data, p=p, axis=axis)
+        out = paddle.norm(x=data, p=p, axis=axis)
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype)
@@ -213,35 +232,73 @@ def run_fro(self, p, axis, shape_x, dtype):
 def run_pnorm(self, p, axis, shape_x, dtype):
     with fluid.program_guard(fluid.Program()):
         data = fluid.data(name="X", shape=shape_x, dtype=dtype)
-        out = paddle.norm(input=data, p=p, axis=axis)
+        out = paddle.norm(x=data, p=p, axis=axis)
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype)
         expected_result = p_norm(np_input, porder=p, axis=axis).astype(dtype)
         result, = exe.run(feed={"X": np_input}, fetch_list=[out])
-    self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True)
+        self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True)
+
+
+def run_graph(self, p, axis, shape_x, dtype):
+    # Dygraph smoke test: every paddle.norm call below only has to run;
+    # nothing is asserted and p/axis/shape_x/dtype are not used.
+    paddle.disable_static()
+    shape = [2, 3, 4]
+    np_input = np.arange(24).astype('float32') - 12
+    np_input = np_input.reshape(shape)
+    x = paddle.to_tensor(np_input)
+    #[[[-12. -11. -10.  -9.] [ -8.  -7.  -6.  -5.] [ -4.  -3.  -2.  -1.]]
+    # [[  0.   1.   2.   3.] [  4.   5.   6.   7.] [  8.   9.  10.  11.]]]
+    out_pnorm = paddle.norm(x, p=2, axis=-1)
+
+    # frobenius norm with the default axis, then along dimensions [0, 1].
+    out_fro = paddle.norm(x, p='fro')
+    out_fro = paddle.norm(x, p='fro', axis=[0, 1])
+    # 2-order norm along [0, 1] (root of summed squares, i.e.
+    # [17.43559577 16.91153453 16.73320053 16.91153453]), then default axis.
+    out_pnorm = paddle.norm(x, p=2, axis=[0, 1])
+    out_pnorm = paddle.norm(x, p=2)
+    # inf-order norm: max |x| over all elements is 12, then along axis 0.
+    out_pnorm = paddle.norm(x, p=np.inf)
+    out_pnorm = paddle.norm(x, p=np.inf, axis=0)
+
+    # -inf-order norm: min |x| over all elements is 0; along axis 0 it is
+    # [[0. 1. 2. 3.] [4. 5. 6. 5.] [4. 3. 2. 1.]].
+    out_pnorm = paddle.norm(x, p=-np.inf)
+    out_pnorm = paddle.norm(x, p=-np.inf, axis=0)
+    # switch back to static graph mode
+    paddle.enable_static()
 
 
 class API_NormTest(unittest.TestCase):
-    def test_output_result(self):
-        run_out(self, p=2, axis=1, shape_x=[3, 4], shape_y=[3], dtype="float32")
-        run_out(
-            self,
-            p='fro',
-            axis=None,
-            shape_x=[3, 4],
-            shape_y=[1],
-            dtype="float32")
-
     def test_basic(self):
-        run_fro(self, p='fro', axis=None, shape_x=[3, 3, 4], dtype="float32")
-        run_fro(self, p='fro', axis=[0, 1], shape_x=[3, 3, 4], dtype="float64")
+        run_fro(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32")
+        run_fro(self, p='fro', axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
         run_pnorm(self, p=2, axis=None, shape_x=[3, 4], dtype="float32")
         run_pnorm(self, p=2, axis=1, shape_x=[3, 4], dtype="float64")
-        run_pnorm(self, p=np.inf, axis=1, shape_x=[3, 4], dtype="float32")
-        run_pnorm(self, p=-np.inf, axis=1, shape_x=[3, 4], dtype="float64")
+        run_pnorm(self, p=np.inf, axis=0, shape_x=[2, 3, 4], dtype="float32")
+        run_pnorm(self, p=np.inf, axis=None, shape_x=[2, 3, 4], dtype="float32")
+        run_pnorm(self, p=-np.inf, axis=0, shape_x=[2, 3, 4], dtype="float64")
+        run_pnorm(
+            self, p=-np.inf, axis=None, shape_x=[2, 3, 4], dtype="float64")
         run_pnorm(self, p=0, axis=1, shape_x=[3, 4], dtype="float64")
 
+        run_pnorm(self, p=1, axis=1, shape_x=[3, 4], dtype="float64")
+        run_pnorm(self, p=0, axis=None, shape_x=[3, 4], dtype="float64")
+        run_pnorm(self, p=2, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
+        run_pnorm(self, p=2, axis=-1, shape_x=[2, 3, 4], dtype="float64")
+        run_pnorm(self, p=1, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
+        run_pnorm(self, p=0, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
+        run_pnorm(
+            self, p=np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
+        run_pnorm(
+            self, p=-np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
+
+    def test_dygraph(self):
+        run_graph(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32")
+
     def test_name(self):
         with fluid.program_guard(fluid.Program()):
             x = fluid.data(name="x", shape=[10, 10], dtype="float32")
@@ -268,11 +325,7 @@ class API_NormTest(unittest.TestCase):
             self.assertRaises(ValueError, paddle.norm, data, p="unsupport norm")
             self.assertRaises(ValueError, paddle.norm, data, p=[1])
             self.assertRaises(ValueError, paddle.norm, data, p=[1], axis=-1)
-            self.assertRaises(
-                ValueError, paddle.norm, data, p='unspport', axis=[-2, -1])
             data = fluid.data(name="data_3d", shape=[2, 2, 2], dtype="float64")
-            self.assertRaises(
-                ValueError, paddle.norm, data, p='unspport', axis=[-2, -1])
             self.assertRaises(
                 ValueError, paddle.norm, data, p='unspport', axis=[-3, -2, -1])
 
diff --git a/python/paddle/fluid/tests/unittests/test_normal.py b/python/paddle/fluid/tests/unittests/test_normal.py
index a9d9af4d50be77bd1d2ecc11dd872ef612209f1e..3e6855feaf491727203063f5c75c68301abbe05e 100644
--- a/python/paddle/fluid/tests/unittests/test_normal.py
+++ b/python/paddle/fluid/tests/unittests/test_normal.py
@@ -25,7 +25,7 @@ class TestNormalAPI(unittest.TestCase):
         self.mean = 1.0
         self.std = 0.0
         self.shape = None
-        self.repeat_num = 1000
+        self.repeat_num = 2000
         self.set_attrs()
         self.dtype = self.get_dtype()
         self.place=paddle.CUDAPlace(0) \
@@ -134,8 +134,8 @@ class TestNormalAPI(unittest.TestCase):
                 if isinstance(self.mean, np.ndarray) else self.mean
             std_ref=self.std.reshape([1, -1]) \
                 if isinstance(self.std, np.ndarray) else self.std
-            self.assertTrue(np.allclose(mean_ref, mean, 0.1, 0.1))
-            self.assertTrue(np.allclose(std_ref, std, 0.1, 0.1))
+            self.assertTrue(np.allclose(mean_ref, mean, 0.2, 0.2))
+            self.assertTrue(np.allclose(std_ref, std, 0.2, 0.2))
 
 
 class TestNormalAPI_mean_is_tensor(TestNormalAPI):
diff --git a/python/paddle/fluid/tests/unittests/test_normalize.py b/python/paddle/fluid/tests/unittests/test_normalize.py
index 6595a29b24ae23c9b38538035c9593ba77eecdb7..614e0e897613b235e2ec6fa72cfaf1057e7d5bbd 100644
--- a/python/paddle/fluid/tests/unittests/test_normalize.py
+++ b/python/paddle/fluid/tests/unittests/test_normalize.py
@@ -23,8 +23,6 @@ import numpy as np
 
 
 def p_normalize(x, axis=1, p=2, epsilon=1e-12, keepdims=True):
-    if len(x.shape) == 1:
-        axis = 0
     xp = np.power(np.abs(x), p)
     s = np.sum(xp, axis=axis, keepdims=keepdims)
     r = np.maximum(np.power(s, 1.0 / p), epsilon)
@@ -38,10 +36,10 @@ class TestNNFunctionalNormalize(unittest.TestCase):
         self.expected0 = p_normalize(self.input_np)
         self.expected1 = p_normalize(self.input_np, p=1.5)
         self.expected2 = p_normalize(self.input_np, axis=0)
-        self.expected3 = p_normalize(self.input_np2)
+        self.expected3 = p_normalize(self.input_np2, axis=0)
 
     def run_imperative(self):
-        x = paddle.to_variable(self.input_np)
+        x = paddle.to_tensor(self.input_np)
         y = F.normalize(x)
         self.assertTrue(np.allclose(y.numpy(), self.expected0))
 
@@ -51,10 +49,12 @@ class TestNNFunctionalNormalize(unittest.TestCase):
         y = F.normalize(x, axis=0)
         self.assertTrue(np.allclose(y.numpy(), self.expected2))
 
-        x = paddle.to_variable(self.input_np2)
-        y = F.normalize(x)
+        x = paddle.to_tensor(self.input_np2)
+        y = F.normalize(x, axis=0)
         self.assertTrue(np.allclose(y.numpy(), self.expected3))
 
+        self.assertRaises(BaseException, F.normalize, x)
+
     def run_static(self, use_gpu=False):
         x = paddle.data(name='input', shape=[10, 10], dtype='float32')
         x2 = paddle.data(name='input2', shape=[2], dtype='float32')
@@ -62,7 +62,7 @@ class TestNNFunctionalNormalize(unittest.TestCase):
         result1 = F.normalize(x, p=1.5)
         result2 = F.normalize(x, axis=0)
         result3 = F.normalize(x, name='aaa')
-        result4 = F.normalize(x2)
+        result4 = F.normalize(x2, axis=0)
 
         place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
         exe = fluid.Executor(place)
@@ -77,6 +77,7 @@ class TestNNFunctionalNormalize(unittest.TestCase):
         self.assertTrue(np.allclose(static_result[2], self.expected2))
         self.assertTrue('aaa' in result3.name)
         self.assertTrue(np.allclose(static_result[3], self.expected3))
+        self.assertRaises(ValueError, F.normalize, x2)
 
     def test_cpu(self):
         paddle.disable_static(place=paddle.fluid.CPUPlace())
diff --git a/python/paddle/fluid/tests/unittests/test_pad3d_op.py b/python/paddle/fluid/tests/unittests/test_pad3d_op.py
index 68589e6d8182f9e6dfdabfc7bce4c20bec521740..11719a9c4a92807375c1fdfcc7e168dccc5e522c 100644
--- a/python/paddle/fluid/tests/unittests/test_pad3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pad3d_op.py
@@ -166,7 +166,11 @@ class TestPadAPI(unittest.TestCase):
             value = 100
             input_data = np.random.rand(*input_shape).astype(np.float32)
             x = paddle.data(name="x", shape=input_shape)
-            result = F.pad(x=x, pad=pad, value=value, mode=mode)
+            result = F.pad(x=x,
+                           pad=pad,
+                           value=value,
+                           mode=mode,
+                           data_format="NCDHW")
             exe = Executor(place)
             fetches = exe.run(default_main_program(),
                               feed={"x": input_data},
@@ -666,5 +670,44 @@ class TestPad3dOpError(unittest.TestCase):
         self.assertRaises(Exception, test_reflect_3)
 
 
+class TestPadDataformatError(unittest.TestCase):
+    def test_errors(self):
+        def test_ncl():
+            paddle.disable_static(paddle.CPUPlace())
+            input_shape = (1, 2, 3, 4)
+            pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32'))
+            data = np.arange(
+                np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1
+            my_pad = nn.ReplicationPad1d(padding=pad, data_format="NCL")
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+
+        def test_nchw():
+            paddle.disable_static(paddle.CPUPlace())
+            input_shape = (1, 2, 4)
+            pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32'))
+            data = np.arange(
+                np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1
+            my_pad = nn.ReplicationPad1d(padding=pad, data_format="NCHW")
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+
+        def test_ncdhw():
+            paddle.disable_static(paddle.CPUPlace())
+            input_shape = (1, 2, 3, 4)
+            pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32'))
+            data = np.arange(
+                np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1
+            my_pad = nn.ReplicationPad1d(padding=pad, data_format="NCDHW")
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+
+        self.assertRaises(AssertionError, test_ncl)
+
+        self.assertRaises(AssertionError, test_nchw)
+
+        self.assertRaises(AssertionError, test_ncdhw)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
index 858d56c1fc04f61c9dd281a633f7be9aceff8338..2ffe523ef6dda18a24813e702a1892c335ba6a68 100644
--- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
@@ -52,8 +52,6 @@ class TestDygraphDoubleGrad(TestCase):
              retain_graph=None,
              create_graph=False,
              allow_unused=False):
-        backward_strategy = fluid.dygraph.BackwardStrategy()
-        backward_strategy.sort_sum_gradient = self.sort_sum_gradient
         return paddle.grad(
             outputs=outputs,
             inputs=inputs,
@@ -61,8 +59,7 @@ class TestDygraphDoubleGrad(TestCase):
             no_grad_vars=no_grad_vars,
             retain_graph=retain_graph,
             create_graph=create_graph,
-            allow_unused=allow_unused,
-            backward_strategy=backward_strategy)
+            allow_unused=allow_unused)
 
     @dygraph_guard
     def test_exception(self):
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
index 5677157fde8d718cbeb6691849421efbbb47dc10..bac196b1ab52b604a85321a5473d455d2616bf0d 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
@@ -13,11 +13,16 @@
 # limitations under the License.
 
 from __future__ import print_function
+
+import os
+import sys
 import unittest
-from test_dist_base import TestDistBase
+
 import paddle.fluid as fluid
+from test_dist_base import TestDistBase
+from spawn_runner_base import TestDistSpawnRunner
+from parallel_dygraph_mnist import TestMnist
 
-import os
 flag_name = os.path.splitext(__file__)[0]
 
 
@@ -36,5 +41,11 @@ class TestParallelDygraphMnist(TestDistBase):
                 log_name=flag_name)
 
 
+class TestParallelDygraphMnistSpawn(TestDistSpawnRunner):
+    def test_mnist_with_spawn(self):
+        if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
+            self.check_dist_result_with_spawn(test_class=TestMnist, delta=1e-5)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py
index 8c5cdf8321a4bdff8981b3b0dafed66d067a41e3..cf89dc484c4880092962eb0322b98bac1d15dcd3 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py
@@ -13,11 +13,16 @@
 # limitations under the License.
 
 from __future__ import print_function
+
+import os
+import sys
 import unittest
-from test_dist_base import TestDistBase
+
 import paddle.fluid as fluid
+from test_dist_base import TestDistBase
+from spawn_runner_base import TestDistSpawnRunner
+from parallel_dygraph_se_resnext import TestSeResNeXt
 
-import os
 flag_name = os.path.splitext(__file__)[0]
 
 
@@ -36,5 +41,12 @@ class TestParallelDygraphSeResNeXt(TestDistBase):
                 log_name=flag_name)
 
 
+class TestParallelDygraphSeResNeXtSpawn(TestDistSpawnRunner):
+    def test_se_resnext_with_spawn(self):
+        if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
+            self.check_dist_result_with_spawn(
+                test_class=TestSeResNeXt, delta=0.01)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py
index 40b5833053d29bb36c5f15aca23aadb761597838..7f051f1005c7b7591bd813e1c4677e383c439ed0 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py
@@ -15,10 +15,13 @@
 from __future__ import print_function
 
 import os
+import sys
 import unittest
-import paddle.fluid as fluid
 
+import paddle.fluid as fluid
 from test_dist_base import TestDistBase
+from spawn_runner_base import TestDistSpawnRunner
+from parallel_dygraph_sparse_embedding import TestSparseEmbedding
 
 flag_name = os.path.splitext(__file__)[0]
 
@@ -38,5 +41,12 @@ class TestParallelDygraphSparseEmdedding(TestDistBase):
                 log_name=flag_name)
 
 
+class TestParallelDygraphSparseEmdeddingSpawn(TestDistSpawnRunner):
+    def test_sparse_embedding_with_spawn(self):
+        if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
+            self.check_dist_result_with_spawn(
+                test_class=TestSparseEmbedding, delta=1e-5)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py
index 385c4d892a650bc18400f4f59cd2ed10bea24d70..c8d47eab2c5191e941d188845a927b5202af9738 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py
@@ -15,10 +15,13 @@
 from __future__ import print_function
 
 import os
+import sys
 import unittest
-import paddle.fluid as fluid
 
+import paddle.fluid as fluid
 from test_dist_base import TestDistBase
+from spawn_runner_base import TestDistSpawnRunner
+from parallel_dygraph_transformer import TestTransformer
 
 flag_name = os.path.splitext(__file__)[0]
 
@@ -38,5 +41,12 @@ class TestParallelDygraphTransformer(TestDistBase):
                 log_name=flag_name)
 
 
+class TestParallelDygraphTransformerSpawn(TestDistSpawnRunner):
+    def test_transformer_with_spawn(self):
+        if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
+            self.check_dist_result_with_spawn(
+                test_class=TestTransformer, delta=1e-5)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py
index b1a25ad3529e8b0a4126bc458838ecd876e5af30..1c05b96f1fc61234028e940f6403ae08a0186027 100644
--- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py
@@ -174,66 +174,6 @@ class TestPool1d_API(unittest.TestCase):
             result = max_pool1d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-    def check_adaptive_max_dygraph_results(self, place):
-        with fluid.dygraph.guard(place):
-            input_np = np.random.random([2, 3, 32]).astype("float32")
-            input = fluid.dygraph.to_variable(input_np)
-            result = F.adaptive_max_pool1d(input, output_size=16)
-
-            result_np = max_pool1D_forward_naive(
-                input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
-            self.assertTrue(np.allclose(result.numpy(), result_np))
-
-            ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1d(
-                output_size=16)
-            result = ada_max_pool1d_dg(input)
-            self.assertTrue(np.allclose(result.numpy(), result_np))
-
-    def check_adaptive_avg_dygraph_results(self, place):
-        with fluid.dygraph.guard(place):
-            input_np = np.random.random([2, 3, 32]).astype("float32")
-            input = fluid.dygraph.to_variable(input_np)
-            result = F.adaptive_avg_pool1d(input, output_size=16)
-            result_np = avg_pool1D_forward_naive(
-                input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
-
-            self.assertTrue(np.allclose(result.numpy(), result_np))
-
-            ada_max_pool1d_dg = paddle.nn.layer.AdaptiveAvgPool1d(
-                output_size=16)
-            result = ada_max_pool1d_dg(input)
-            self.assertTrue(np.allclose(result.numpy(), result_np))
-
-    def check_adaptive_max_static_results(self, place):
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
-            result = F.adaptive_max_pool1d(input, output_size=16)
-
-            input_np = np.random.random([2, 3, 32]).astype("float32")
-            result_np = max_pool1D_forward_naive(
-                input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
-
-            exe = fluid.Executor(place)
-            fetches = exe.run(fluid.default_main_program(),
-                              feed={"input": input_np},
-                              fetch_list=[result])
-            self.assertTrue(np.allclose(fetches[0], result_np))
-
-    def check_adaptive_avg_static_results(self, place):
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
-            result = F.adaptive_avg_pool1d(input, output_size=16)
-
-            input_np = np.random.random([2, 3, 32]).astype("float32")
-            result_np = avg_pool1D_forward_naive(
-                input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
-
-            exe = fluid.Executor(place)
-            fetches = exe.run(fluid.default_main_program(),
-                              feed={"input": input_np},
-                              fetch_list=[result])
-            self.assertTrue(np.allclose(fetches[0], result_np))
-
     def check_max_dygraph_padding_same(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32]).astype("float32")
@@ -265,10 +205,6 @@ class TestPool1d_API(unittest.TestCase):
             self.check_avg_dygraph_results(place)
             self.check_max_static_results(place)
             self.check_avg_static_results(place)
-            self.check_adaptive_max_dygraph_results(place)
-            self.check_adaptive_avg_dygraph_results(place)
-            self.check_adaptive_max_static_results(place)
-            self.check_adaptive_avg_static_results(place)
             self.check_max_dygraph_padding_same(place)
             self.check_avg_dygraph_padding_same(place)
 
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py
index 73df0885d8fed4ddc4c03c91d2c331e72772e398..93a2be6de342efc4e8284e7c352137d0a3a1bcb9 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py
@@ -17,7 +17,7 @@ import unittest
 from op_test import OpTest
 import numpy as np
 import paddle.fluid.core as core
-from paddle.nn.functional import *
+from paddle.nn.functional import avg_pool2d, max_pool2d
 import paddle.fluid as fluid
 import paddle
 
diff --git a/python/paddle/fluid/tests/unittests/test_pow.py b/python/paddle/fluid/tests/unittests/test_pow.py
new file mode 100755
index 0000000000000000000000000000000000000000..0764cb580e40d115d8703278380a9ced12e24201
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pow.py
@@ -0,0 +1,239 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import paddle
+import paddle.tensor as tensor
+import paddle.fluid as fluid
+from paddle.static import Program, program_guard
+import numpy as np
+import unittest
+
+DYNAMIC = 1
+STATIC = 2
+
+
+def _run_power(mode, x, y):
+    # dynamic mode
+    if mode == DYNAMIC:
+        paddle.disable_static()
+        # y is scalar
+        if isinstance(y, (int, float)):
+            x_ = paddle.to_tensor(x)
+            y_ = y
+            res = paddle.pow(x_, y_)
+            return res.numpy()
+        # y is tensor
+        else:
+            x_ = paddle.to_tensor(x)
+            y_ = paddle.to_tensor(y)
+            res = paddle.pow(x_, y_)
+            return res.numpy()
+    # static mode
+    elif mode == STATIC:
+        paddle.enable_static()
+        # y is scalar
+        if isinstance(y, (int, float)):
+            with program_guard(Program(), Program()):
+                x_ = paddle.static.data(name="x", shape=x.shape, dtype=x.dtype)
+                y_ = y
+                res = paddle.pow(x_, y_)
+                place = fluid.CPUPlace()
+                exe = fluid.Executor(place)
+                outs = exe.run(feed={'x': x}, fetch_list=[res])
+                return outs[0]
+        # y is tensor
+        else:
+            with program_guard(Program(), Program()):
+                x_ = paddle.static.data(name="x", shape=x.shape, dtype=x.dtype)
+                y_ = paddle.static.data(name="y", shape=y.shape, dtype=y.dtype)
+                res = paddle.pow(x_, y_)
+                place = fluid.CPUPlace()
+                exe = fluid.Executor(place)
+                outs = exe.run(feed={'x': x, 'y': y}, fetch_list=[res])
+                return outs[0]
+
+
+class TestPowerAPI(unittest.TestCase):
+    """TestPowerAPI."""
+
+    def test_power(self):
+        """test_power."""
+        np.random.seed(7)
+        # test 1-d float tensor ** float scalar
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = np.random.rand() * 10
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d float tensor ** int scalar
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = int(np.random.rand() * 10)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        x = (np.random.rand(*dims) * 10).astype(np.int64)
+        y = int(np.random.rand() * 10)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d float tensor ** 1-d float tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(*dims) * 10).astype(np.float64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d float tensor ** 1-d int tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(*dims) * 10).astype(np.int64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int tensor ** 1-d float tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.int64)
+        y = (np.random.rand(*dims) * 10).astype(np.float64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int tensor ** 1-d int tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.int64)
+        y = (np.random.rand(*dims) * 10).astype(np.int64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int32 tensor ** 1-d int32 tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.int32)
+        y = (np.random.rand(*dims) * 10).astype(np.int32)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int64 tensor ** 1-d int32 tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.int64)
+        y = (np.random.rand(*dims) * 10).astype(np.int32)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d int32 tensor ** 1-d int64 tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.int32)
+        y = (np.random.rand(*dims) * 10).astype(np.int64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d float32 tensor ** 1-d float32 tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float32)
+        y = (np.random.rand(*dims) * 10).astype(np.float32)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d float64 tensor ** 1-d float32 tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(*dims) * 10).astype(np.float32)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d float64 tensor ** 1-d int32 tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(*dims) * 10).astype(np.int32)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test 1-d float32 tensor ** 1-d int64 tensor
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float32)
+        y = (np.random.rand(*dims) * 10).astype(np.int64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+        # test broadcast
+        dims = (np.random.randint(1, 10), np.random.randint(5, 10),
+                np.random.randint(5, 10))
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(dims[-1]) * 10).astype(np.float64)
+        res = _run_power(DYNAMIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+        res = _run_power(STATIC, x, y)
+        self.assertTrue(np.allclose(res, np.power(x, y)))
+
+
+class TestPowerError(unittest.TestCase):
+    """TestPowerError."""
+
+    def test_errors(self):
+        """test_errors."""
+        np.random.seed(7)
+
+        # test dynamic and static computation graphs: inputs must be broadcastable
+        dims = (np.random.randint(1, 10), np.random.randint(5, 10),
+                np.random.randint(5, 10))
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(dims[-1] + 1) * 10).astype(np.float64)
+        self.assertRaises(fluid.core.EnforceNotMet, _run_power, DYNAMIC, x, y)
+        self.assertRaises(fluid.core.EnforceNotMet, _run_power, STATIC, x, y)
+
+        # test raw numpy inputs (y is int8) passed directly to paddle.pow: expects TypeError
+        dims = (np.random.randint(1, 10), np.random.randint(5, 10),
+                np.random.randint(5, 10))
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = (np.random.rand(dims[-1] + 1) * 10).astype(np.int8)
+        self.assertRaises(TypeError, paddle.pow, x, y)
+
+        # test 1-d float64 numpy array ** string exponent: expects TypeError
+        dims = (np.random.randint(200, 300), )
+        x = (np.random.rand(*dims) * 10).astype(np.float64)
+        y = int(np.random.rand() * 10)
+        self.assertRaises(TypeError, paddle.pow, x, str(y))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py
index 6045f2d713627cedfe169b9e066222904244311a..32d8f73552f71dd1066e19b70f1f2e4b8628950a 100644
--- a/python/paddle/fluid/tests/unittests/test_py_func_op.py
+++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py
@@ -147,10 +147,8 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor):
 
     with fluid.program_guard(fluid.Program(), fluid.Program()):
         with fluid.scope_guard(fluid.core.Scope()):
-            fluid.default_main_program().random_seed = 1
-            fluid.default_startup_program().random_seed = 1
+            gen = paddle.manual_seed(1)
             np.random.seed(1)
-
             img = fluid.layers.data(name='image', shape=[784], dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             loss = simple_fc_net(img, label, use_py_func_op)
@@ -189,17 +187,17 @@ class TestPyFuncOpUseExecutor(unittest.TestCase):
         self.use_parallel_executor = False
 
     def test_loss_diff(self):
-        losses = []
         for use_cuda in [True, False]:
+            losses = []
             for use_py_func_op in [True, False]:
                 L = test_main(use_cuda, use_py_func_op,
                               self.use_parallel_executor)
                 if L is not None:
                     losses.append(L)
 
-        for idx in six.moves.range(len(losses) - 1):
-            max_diff = np.max(np.abs(losses[idx] - losses[0]))
-            self.assertAlmostEqual(max_diff, 0, delta=1e-3)
+                for idx in six.moves.range(len(losses) - 1):
+                    max_diff = np.max(np.abs(losses[idx] - losses[0]))
+                    self.assertAlmostEqual(max_diff, 0, delta=1e-3)
 
 
 class TestPyFuncOpUseParallelExecutor(TestPyFuncOpUseExecutor):
diff --git a/python/paddle/fluid/tests/unittests/test_random_seed.py b/python/paddle/fluid/tests/unittests/test_random_seed.py
index 2933abe46c1b87959c9f61975c02a41e91dfbef3..343508bf619b6a7328016464794221f7284435b2 100644
--- a/python/paddle/fluid/tests/unittests/test_random_seed.py
+++ b/python/paddle/fluid/tests/unittests/test_random_seed.py
@@ -26,27 +26,31 @@ import paddle.fluid.core as core
 
 
 class TestGeneratorSeed(unittest.TestCase):
-    """
-    Test cases for cpu generator seed.
-    """
+    #     """
+    #     Test cases for cpu generator seed.
+    #     """
 
     def test_generator_uniform_random_dygraph(self):
         """Test Generator seed."""
-        gen = generator.Generator()
 
         fluid.enable_dygraph()
 
-        gen.manual_seed(12312321111)
+        gen = paddle.manual_seed(12312321111)
         x = fluid.layers.uniform_random([10], dtype="float32", min=0.0, max=1.0)
+
         st1 = gen.get_state()
         x1 = fluid.layers.uniform_random(
             [10], dtype="float32", min=0.0, max=1.0)
+
         gen.set_state(st1)
+        print(gen.get_state())
         x2 = fluid.layers.uniform_random(
             [10], dtype="float32", min=0.0, max=1.0)
-        gen.manual_seed(12312321111)
+
+        paddle.manual_seed(12312321111)
         x3 = fluid.layers.uniform_random(
             [10], dtype="float32", min=0.0, max=1.0)
+
         x_np = x.numpy()
         x1_np = x1.numpy()
         x2_np = x2.numpy()
@@ -57,11 +61,9 @@ class TestGeneratorSeed(unittest.TestCase):
             self.assertTrue(np.allclose(x_np, x3_np))
 
     def test_generator_uniform_random_static(self):
-
         fluid.disable_dygraph()
 
-        gen = generator.Generator()
-        gen.manual_seed(123123143)
+        gen = paddle.manual_seed(123123143)
 
         startup_program = fluid.Program()
         train_program = fluid.Program()
@@ -93,11 +95,9 @@ class TestGeneratorSeed(unittest.TestCase):
                 self.assertTrue(not np.allclose(out1_res2, out1_res1))
 
     def test_gen_dropout_dygraph(self):
-        gen = generator.Generator()
-
         fluid.enable_dygraph()
 
-        gen.manual_seed(111111111)
+        gen = paddle.manual_seed(111111111)
         st = gen.get_state()
         # x = np.arange(1,101).reshape(2,50).astype("float32")
         x = fluid.layers.uniform_random(
@@ -110,8 +110,7 @@ class TestGeneratorSeed(unittest.TestCase):
         y1 = fluid.layers.dropout(x1, 0.5)
         y_np = y.numpy()
         y1_np = y1.numpy()
-        #print(y_np)
-        #print(y1_np)
+
         if not core.is_compiled_with_cuda():
             print(">>>>>>> dropout dygraph >>>>>>>")
             self.assertTrue(np.allclose(y_np, y1_np))
@@ -119,8 +118,7 @@ class TestGeneratorSeed(unittest.TestCase):
     def test_gen_dropout_static(self):
         fluid.disable_dygraph()
 
-        gen = generator.Generator()
-        gen.manual_seed(123123143)
+        gen = paddle.manual_seed(123123143)
 
         startup_program = fluid.Program()
         train_program = fluid.Program()
@@ -137,19 +135,16 @@ class TestGeneratorSeed(unittest.TestCase):
             out2 = exe.run(train_program, feed={}, fetch_list=[y_1])
         out1_np = np.array(out1[0])
         out2_np = np.array(out2[0])
-        # print(out1_np)
-        # print(out2_np)
+
         if not core.is_compiled_with_cuda():
             print(">>>>>>> dropout static >>>>>>>")
             self.assertTrue(np.allclose(out1_np, out2_np))
 
     def test_generator_gaussian_random_dygraph(self):
         """Test Generator seed."""
-        gen = generator.Generator()
-
         fluid.enable_dygraph()
 
-        gen.manual_seed(12312321111)
+        gen = paddle.manual_seed(12312321111)
         x = fluid.layers.gaussian_random([10], dtype="float32")
         st1 = gen.get_state()
         x1 = fluid.layers.gaussian_random([10], dtype="float32")
@@ -168,11 +163,9 @@ class TestGeneratorSeed(unittest.TestCase):
             self.assertTrue(np.allclose(x_np, x3_np))
 
     def test_generator_gaussian_random_static(self):
-
         fluid.disable_dygraph()
 
-        gen = generator.Generator()
-        gen.manual_seed(123123143)
+        gen = paddle.manual_seed(123123143)
 
         startup_program = fluid.Program()
         train_program = fluid.Program()
@@ -210,7 +203,7 @@ class TestGeneratorSeed(unittest.TestCase):
 
         fluid.enable_dygraph()
 
-        gen.manual_seed(12312321111)
+        gen = paddle.manual_seed(12312321111)
         x = paddle.randint(low=10, shape=[10], dtype="int32")
         st1 = gen.get_state()
         x1 = paddle.randint(low=10, shape=[10], dtype="int32")
@@ -228,12 +221,64 @@ class TestGeneratorSeed(unittest.TestCase):
             self.assertTrue(np.allclose(x1_np, x2_np))
             self.assertTrue(np.allclose(x_np, x3_np))
 
-    def test_generator_ranint_static(self):
+    def test_generator_uniform_random_static(self):
+        fluid.disable_dygraph()
+
+        gen = paddle.manual_seed(123123143)
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            result_1 = fluid.layers.uniform_random(shape=[3, 4])
+            result_2 = fluid.layers.uniform_random(shape=[3, 4])
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+            # Re-seed the generator so the second run replays the same stream.
+            gen.manual_seed(123123143)
+            out2 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
 
+            out1_res1 = np.array(out1[0])
+            out1_res2 = np.array(out1[1])
+            out2_res1 = np.array(out2[0])
+            out2_res2 = np.array(out2[1])
+
+            if not core.is_compiled_with_cuda():
+                self.assertTrue(np.allclose(out1_res1, out2_res1))
+                self.assertTrue(np.allclose(out1_res2, out2_res2))
+                self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+    def test_generator_randint_dygraph(self):
+        """Check that randint with default shape/dtype replays after set_state and re-seeding."""
+        fluid.enable_dygraph()
+
+        gen = paddle.manual_seed(12312321111)
+        x = paddle.randint(low=1000)  # high=None -> range [0, low); low=1 would always yield 0
+        st1 = gen.get_state()  # snapshot taken after the first draw
+        x1 = paddle.randint(low=1000)
+        gen.set_state(st1)  # rewind to the snapshot: x2 must repeat x1
+        x2 = paddle.randint(low=1000)
+        gen.manual_seed(12312321111)  # restart the stream: x3 must repeat x
+        x3 = paddle.randint(low=1000)
+        x_np = x.numpy()
+        x1_np = x1.numpy()
+        x2_np = x2.numpy()
+        x3_np = x3.numpy()
+        if not core.is_compiled_with_cuda():  # equality is only asserted on CPU-only builds
+            self.assertTrue(np.allclose(x1_np, x2_np))
+            self.assertTrue(np.allclose(x_np, x3_np))
+
+    def test_generator_ranint_static(self):
         fluid.disable_dygraph()
 
-        gen = generator.Generator()
-        gen.manual_seed(123123143)
+        gen = paddle.manual_seed(123123143)
 
         startup_program = fluid.Program()
         train_program = fluid.Program()
@@ -267,11 +312,10 @@ class TestGeneratorSeed(unittest.TestCase):
 
     def test_generator_randperm_dygraph(self):
         """Test Generator seed."""
-        gen = generator.Generator()
 
         fluid.enable_dygraph()
 
-        gen.manual_seed(12312321111)
+        gen = paddle.manual_seed(12312321111)
         x = paddle.randperm(10)
         st1 = gen.get_state()
         x1 = paddle.randperm(10)
@@ -284,9 +328,6 @@ class TestGeneratorSeed(unittest.TestCase):
         x2_np = x2.numpy()
         x3_np = x3.numpy()
 
-        # print("## {}".format(x1_np))
-        # print("## {}".format(x2_np))
-
         if not core.is_compiled_with_cuda():
             print(">>>>>>> randperm dygraph >>>>>>>")
             self.assertTrue(np.allclose(x1_np, x2_np))
@@ -296,8 +337,7 @@ class TestGeneratorSeed(unittest.TestCase):
 
         fluid.disable_dygraph()
 
-        gen = generator.Generator()
-        gen.manual_seed(123123143)
+        paddle.manual_seed(123123143)
 
         startup_program = fluid.Program()
         train_program = fluid.Program()
@@ -312,8 +352,8 @@ class TestGeneratorSeed(unittest.TestCase):
             out1 = exe.run(train_program,
                            feed={},
                            fetch_list=[result_1, result_2])
-            #gen.set_state(cur_state)
-            gen.manual_seed(123123143)
+
+            paddle.manual_seed(123123143)
             out2 = exe.run(train_program,
                            feed={},
                            fetch_list=[result_1, result_2])
@@ -331,7 +371,7 @@ class TestGeneratorSeed(unittest.TestCase):
 
     def test_generator_sampling_id_dygraph(self):
         """Test Generator seed."""
-        gen = generator.Generator()
+        gen = paddle.manual_seed(12312321111)
 
         fluid.enable_dygraph()
 
@@ -339,14 +379,17 @@ class TestGeneratorSeed(unittest.TestCase):
         x = fluid.layers.uniform_random(
             [10, 10], dtype="float32", min=0.0, max=1.0)
         y = fluid.layers.sampling_id(x)
+
         st1 = gen.get_state()
         x1 = fluid.layers.uniform_random(
             [10, 10], dtype="float32", min=0.0, max=1.0)
         y1 = fluid.layers.sampling_id(x)
+
         gen.set_state(st1)
         x2 = fluid.layers.uniform_random(
             [10, 10], dtype="float32", min=0.0, max=1.0)
         y2 = fluid.layers.sampling_id(x)
+
         gen.manual_seed(12312321111)
         x3 = fluid.layers.uniform_random(
             [10, 10], dtype="float32", min=0.0, max=1.0)
@@ -357,9 +400,6 @@ class TestGeneratorSeed(unittest.TestCase):
         x2_np = y2.numpy()
         x3_np = y3.numpy()
 
-        print("## {}".format(x1_np))
-        print("## {}".format(x2_np))
-
         if not core.is_compiled_with_cuda():
             print(">>>>>>> sampling id dygraph >>>>>>>")
             self.assertTrue(np.allclose(x1_np, x2_np))
@@ -369,8 +409,7 @@ class TestGeneratorSeed(unittest.TestCase):
 
         fluid.disable_dygraph()
 
-        gen = generator.Generator()
-        gen.manual_seed(123123143)
+        paddle.manual_seed(123123143)
 
         startup_program = fluid.Program()
         train_program = fluid.Program()
@@ -386,8 +425,8 @@ class TestGeneratorSeed(unittest.TestCase):
             out1 = exe.run(train_program,
                            feed={},
                            fetch_list=[result_1, result_2])
-            #gen.set_state(cur_state)
-            gen.manual_seed(123123143)
+
+            paddle.manual_seed(123123143)
             out2 = exe.run(train_program,
                            feed={},
                            fetch_list=[result_1, result_2])
@@ -406,8 +445,7 @@ class TestGeneratorSeed(unittest.TestCase):
     def test_gen_TruncatedNormal_initializer(self):
         fluid.disable_dygraph()
 
-        gen = generator.Generator()
-        gen.manual_seed(123123143)
+        gen = paddle.manual_seed(123123143)
         cur_state = gen.get_state()
 
         startup_program = fluid.Program()
@@ -432,9 +470,7 @@ class TestGeneratorSeed(unittest.TestCase):
             out1 = exe.run(train_program,
                            feed={},
                            fetch_list=[result_1, result_2])
-            #gen.set_state(cur_state)
 
-        #gen.set_state(cur_state)    
         gen.manual_seed(123123143)
         with fluid.program_guard(train_program, startup_program):
             exe.run(startup_program)
@@ -447,11 +483,6 @@ class TestGeneratorSeed(unittest.TestCase):
         out2_res1 = np.array(out2[0])
         out2_res2 = np.array(out2[1])
 
-        print(out1_res1)
-        print(out1_res2)
-        print(out2_res1)
-        print(out2_res2)
-
         if not core.is_compiled_with_cuda():
             print(">>>>>>> sampling id static >>>>>>>")
             self.assertTrue(np.allclose(out1_res1, out2_res1))
diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py
index 58b407f8bc1f41301a068f0b85f4c4e9860a45ff..44087c5421a5ee66273ef35b935926d42dcc37ae 100644
--- a/python/paddle/fluid/tests/unittests/test_regularizer.py
+++ b/python/paddle/fluid/tests/unittests/test_regularizer.py
@@ -169,9 +169,10 @@ class TestRegularizer(unittest.TestCase):
         return param_sum
 
     def check_l2decay_regularizer(self, place, model):
+        paddle.manual_seed(1)
+        paddle.framework.random._manual_program_seed(1)
         main_prog = fluid.framework.Program()
         startup_prog = fluid.framework.Program()
-        startup_prog.random_seed = 1
         with self.scope_prog_guard(
                 main_prog=main_prog, startup_prog=startup_prog):
             data = fluid.layers.data(
@@ -188,9 +189,11 @@ class TestRegularizer(unittest.TestCase):
         return param_sum
 
     def check_l2decay(self, place, model):
+        paddle.manual_seed(1)
+        paddle.framework.random._manual_program_seed(1)
         main_prog = fluid.framework.Program()
         startup_prog = fluid.framework.Program()
-        startup_prog.random_seed = 1
+
         with self.scope_prog_guard(
                 main_prog=main_prog, startup_prog=startup_prog):
             data = fluid.layers.data(
@@ -243,7 +246,8 @@ class TestRegularizer(unittest.TestCase):
         with fluid.dygraph.guard():
             input = fluid.dygraph.to_variable(
                 np.random.randn(3, 5).astype('float32'))
-            fluid.default_main_program().random_seed = 1
+            paddle.manual_seed(1)
+            paddle.framework.random._manual_program_seed(1)
 
             linear1 = fluid.dygraph.Linear(
                 5, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr)
diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
index 0f225758ced3bf7d6fd821be09f2dbf11ff1cc6d..f7b9d4214d36a422a3ec94dc410e58c6c827ef4c 100644
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
@@ -228,7 +228,7 @@ class TestRMSPropV2(unittest.TestCase):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = paddle.to_tensor(value)
-        linear = paddle.nn.Linear(13, 5, dtype="float32")
+        linear = paddle.nn.Linear(13, 5)
         # This can be any optimizer supported by dygraph.
         adam = paddle.optimizer.RMSProp(
             learning_rate=0.01,
diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py
index 590ef11e9cb5de7414ff8745b719e3ffb4e044d8..b5a2e84a53ef621f3be81b90d02c10d28fe18162 100644
--- a/python/paddle/fluid/tests/unittests/test_selu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_selu_op.py
@@ -130,6 +130,11 @@ class TestSeluAPI(unittest.TestCase):
             # The input dtype must be float16, float32, float64.
             x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
             self.assertRaises(TypeError, F.selu, x_int32)
+            # The scale must be greater than 1.0
+            x_fp32 = paddle.data(name='x_fp32', shape=[12, 10], dtype='float32')
+            self.assertRaises(ValueError, F.selu, x_fp32, -1.0)
+            # The alpha must be no less than 0
+            self.assertRaises(ValueError, F.selu, x_fp32, 1.6, -1.0)
             # support the input dtype is float16
             x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
             F.selu(x_fp16)
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py
index fb3fc8735566fcf601a7cb507e3826dd92a5651e..2c87e06e893a4d6495ad81ac3dcdf375a41272fb 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py
@@ -20,6 +20,7 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 from op_test import OpTest
+import paddle
 
 
 class TestSGDOp(OpTest):
@@ -208,5 +209,46 @@ class TestSGDOpWithLargeInput(unittest.TestCase):
         result = exe.run(compiled_prog, fetch_list=[avg_cost])
 
 
+class TestSGDV2(unittest.TestCase):  # smoke tests for the paddle.optimizer.SGD API
+    def test_sgd_dygraph(self):
+        paddle.disable_static()
+        value = np.arange(26).reshape(2, 13).astype("float32")
+        a = paddle.to_tensor(value)
+        linear = paddle.nn.Linear(13, 5)
+        # One forward/backward/step cycle with SGD plus weight decay.
+        sgd = paddle.optimizer.SGD(learning_rate=0.01,
+                                   parameters=linear.parameters(),
+                                   weight_decay=0.01)
+        out = linear(a)
+        out.backward()
+        sgd.step()
+        sgd.clear_gradients()
+
+    def test_sgd(self):
+        place = fluid.CPUPlace()
+        main = fluid.Program()
+        with fluid.program_guard(main):
+            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+            y_predict = fluid.layers.fc(input=x, size=1, act=None)
+            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = fluid.layers.mean(cost)
+
+            sgd = paddle.optimizer.SGD(learning_rate=0.1)  # no parameters argument in static mode
+            sgd.minimize(avg_cost)
+
+            fetch_list = [avg_cost]
+            train_reader = paddle.batch(
+                paddle.dataset.uci_housing.train(), batch_size=1)
+            feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            for data in train_reader():  # one executor run per sample (batch_size=1)
+                exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
+
+    def test_raise_error(self):  # learning_rate=None must be rejected with ValueError
+        self.assertRaises(ValueError, paddle.optimizer.SGD, learning_rate=None)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca92bc75245cebbfdfbbed80e99957d2b4f57b2a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py
@@ -0,0 +1,87 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import numpy as np
+import unittest
+
+import paddle
+import paddle.distributed as dist
+from paddle.distributed.spawn import _get_subprocess_env_list
+
+from paddle.fluid import core
+from paddle.fluid.dygraph import parallel_helper
+
+# NOTE(chenweihang): Coverage CI currently cannot count python3 unittests,
+# so the unittests here cover some cases that will only be executed in the
+# python3 sub-process.
+
+
+class TestInitParallelEnv(unittest.TestCase):  # argument and environment checks of dist.init_parallel_env
+    def test_beckend_type_error(self):  # NOTE(review): "beckend" is a typo for "backend"
+        with self.assertRaises(TypeError):
+            dist.init_parallel_env(backend=1)  # non-string backend
+
+    def test_backend_value_error(self):
+        with self.assertRaises(ValueError):
+            dist.init_parallel_env(backend="mpi")  # string backend that must be rejected
+
+    def test_check_env_failed(self):  # unlike the next test, PADDLE_TRAINER_ENDPOINTS is not set here
+        os.environ['FLAGS_selected_gpus'] = '0'  # NOTE(review): os.environ writes are never restored
+        os.environ['PADDLE_TRAINER_ID'] = '0'
+        os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170'
+        os.environ['PADDLE_TRAINERS_NUM'] = '1'
+        with self.assertRaises(ValueError):  # presumably raised for the missing variable - TODO confirm
+            dist.init_parallel_env()
+
+    def test_init_parallel_env_break(self):
+        os.environ['FLAGS_selected_gpus'] = '0'
+        os.environ['PADDLE_TRAINER_ID'] = '0'
+        os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170'
+        os.environ['PADDLE_TRAINERS_NUM'] = '1'
+        os.environ['PADDLE_TRAINER_ENDPOINTS'] = '127.0.0.1:6170'
+        # Covers the success branch: the call must not raise and must leave the parallel context uninitialized.
+        dist.init_parallel_env()
+        self.assertFalse(parallel_helper._is_parallel_ctx_initialized())
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSpawnAssistMethod(unittest.TestCase):  # checks on the spawn helper _get_subprocess_env_list
+    def test_only_cluster_node_ips_error(self):
+        with self.assertRaises(ValueError):
+            options = dict()
+            options['cluster_node_ips'] = "127.0.0.1,127.0.0.2"  # cluster_node_ips is the only option given
+            _get_subprocess_env_list(nprocs=1, options=options)
+
+    def test_nprocs_greater_than_device_num_error(self):
+        with self.assertRaises(RuntimeError):
+            _get_subprocess_env_list(nprocs=100, options=dict())  # assumes fewer than 100 devices are visible
+
+    def test_selected_gpus_error(self):
+        with self.assertRaises(ValueError):
+            options = dict()
+            options['selected_gpus'] = "100,101"  # assumes GPU ids 100 and 101 do not exist on the host
+            _get_subprocess_env_list(nprocs=2, options=options)
+
+    def test_get_correct_env(self):
+        env_dict = _get_subprocess_env_list(nprocs=1, options=dict())[0]  # first entry of the returned list
+        self.assertEqual(env_dict['PADDLE_TRAINER_ID'], '0')
+        self.assertEqual(env_dict['PADDLE_TRAINERS_NUM'], '1')
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
index b0701a9b187f6c7cf63f43d69f482ea13e6d3fe3..09cd40d9cc59914c82cc343bb78b72fbc2b29e59 100644
--- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
@@ -221,5 +221,21 @@ class TestDygraphSyncBatchNormAPIError(unittest.TestCase):
             self.assertRaises(TypeError, my_sync_batch_norm, x2)
 
 
+class TestConvertSyncBatchNorm(unittest.TestCase):  # SyncBatchNorm.convert_sync_batchnorm on a small model
+    def test_convert(self):
+        if not core.is_compiled_with_cuda():
+            return  # the conversion is only checked on CUDA builds
+
+        with program_guard(Program(), Program()):
+            model = paddle.nn.Sequential(
+                paddle.nn.Conv2d(3, 5, 3), paddle.nn.BatchNorm2d(5))
+            sync_model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+            for idx, sublayer in enumerate(model.sublayers()):
+                if isinstance(sublayer, paddle.nn.BatchNorm2d):  # same index in sync_model must be converted
+                    self.assertIsInstance(
+                        sync_model[idx],
+                        paddle.nn.SyncBatchNorm)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py
index c8d1e77134036bf7b28d4afb8bacaa44092b1053..5fea9f69a18c83be0f6af05784735ea53d0993d2 100644
--- a/python/paddle/fluid/tests/unittests/test_transformer_api.py
+++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py
@@ -211,7 +211,8 @@ def ffn(src, encoder_layer, ffn_fc1_act="relu"):
 class TestTransformer(unittest.TestCase):
     def test_multi_head_attention(self):
         def multihead_attention_test_helper(self_attention, cache):
-            paddle.framework.manual_seed(2020)
+            paddle.manual_seed(2020)
+            paddle.framework.random._manual_program_seed(2020)
             # self_attention|cross_attention, cache|No cache
             with fluid.dygraph.guard(fluid.CPUPlace()):
 
@@ -275,6 +276,7 @@ class TestTransformer(unittest.TestCase):
 
         with fluid.dygraph.guard(fluid.CPUPlace()):
             paddle.framework.manual_seed(2020)
+            paddle.framework.random._manual_program_seed(2020)
 
             ffn_fc1_act = "relu"
             # 1.generate basic params
diff --git a/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..49924b44441aa9ae323f0d7921d71bf58b8c2cf2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py
@@ -0,0 +1,681 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.nn.functional import interpolate
+
+
+def trilinear_interp_np(input,  # array laid out as data_layout; NDHWC is transposed to NCDHW internally
+                        out_d,  # requested output depth (used unless overridden below)
+                        out_h,  # requested output height
+                        out_w,  # requested output width
+                        out_size=None,  # optional (d, h, w) that replaces out_d/out_h/out_w
+                        actual_shape=None,  # optional (d, h, w); applied after out_size
+                        align_corners=True,  # True: ratio = (in - 1) / (out - 1); False: ratio = in / out
+                        align_mode=0,  # 0 together with align_corners=False uses the (i + 0.5) * ratio - 0.5 mapping
+                        data_layout='NCDHW'):  # only "NDHWC" triggers the transposes
+    """NumPy reference for trilinear interpolation; computes on [N, C, D, H, W] and returns the result in data_layout order."""
+    if data_layout == "NDHWC":
+        input = np.transpose(input, (0, 4, 1, 2, 3))  # NDHWC => NCDHW
+    if out_size is not None:  # out_size, when given, replaces the scalar sizes
+        out_d = out_size[0]
+        out_h = out_size[1]
+        out_w = out_size[2]
+    if actual_shape is not None:  # checked last, so it wins over out_size
+        out_d = actual_shape[0]
+        out_h = actual_shape[1]
+        out_w = actual_shape[2]
+    batch_size, channel, in_d, in_h, in_w = input.shape
+
+    ratio_d = ratio_h = ratio_w = 0.0  # a ratio stays 0 when its output dim is <= 1
+    if out_d > 1:
+        if (align_corners):
+            ratio_d = (in_d - 1.0) / (out_d - 1.0)
+        else:
+            ratio_d = 1.0 * in_d / out_d
+    if out_h > 1:
+        if (align_corners):
+            ratio_h = (in_h - 1.0) / (out_h - 1.0)
+        else:
+            ratio_h = 1.0 * in_h / out_h
+    if out_w > 1:
+        if (align_corners):
+            ratio_w = (in_w - 1.0) / (out_w - 1.0)
+        else:
+            ratio_w = 1.0 * in_w / out_w
+
+    out = np.zeros((batch_size, channel, out_d, out_h, out_w))
+
+    for i in range(out_d):
+        if (align_mode == 0 and not align_corners):
+            d = int(ratio_d * (i + 0.5) - 0.5)
+        else:
+            d = int(ratio_d * i)
+
+        d = max(0, d)  # clamp the lower source index at 0
+        did = 1 if d < in_d - 1 else 0  # offset to the next depth slice; 0 on the last slice
+        if (align_mode == 0 and not align_corners):
+            idx_src_d = max(ratio_d * (i + 0.5) - 0.5, 0)
+            d1lambda = idx_src_d - d
+        else:
+            d1lambda = ratio_d * i - d
+        d2lambda = 1.0 - d1lambda  # d1lambda weights slice d + did, d2lambda weights slice d
+
+        for j in range(out_h):
+            if (align_mode == 0 and not align_corners):
+                h = int(ratio_h * (j + 0.5) - 0.5)
+            else:
+                h = int(ratio_h * j)
+
+            h = max(0, h)
+            hid = 1 if h < in_h - 1 else 0  # same neighbour-offset rule as did, along height
+            if (align_mode == 0 and not align_corners):
+                idx_src_h = max(ratio_h * (j + 0.5) - 0.5, 0)
+                h1lambda = idx_src_h - h
+            else:
+                h1lambda = ratio_h * j - h
+            h2lambda = 1.0 - h1lambda
+
+            for k in range(out_w):
+                if (align_mode == 0 and not align_corners):
+                    w = int(ratio_w * (k + 0.5) - 0.5)
+                else:
+                    w = int(ratio_w * k)
+                w = max(0, w)
+                wid = 1 if w < in_w - 1 else 0  # same neighbour-offset rule as did, along width
+                if (align_mode == 0 and not align_corners):
+                    idx_src_w = max(ratio_w * (k + 0.5) - 0.5, 0)
+                    w1lambda = idx_src_w - w
+                else:
+                    w1lambda = ratio_w * k - w
+                w2lambda = 1.0 - w1lambda
+
+                out[:, :, i, j, k] = \
+                    d2lambda * \
+                    (h2lambda * (w2lambda * input[:, :, d, h, w] + \
+                              w1lambda * input[:, :, d, h, w+wid]) + \
+                    h1lambda * (w2lambda * input[:, :, d, h+hid, w] + \
+                              w1lambda * input[:, :, d, h+hid, w+wid])) + \
+                    d1lambda * \
+                    (h2lambda * (w2lambda * input[:, :, d+did, h, w] + \
+                              w1lambda * input[:, :, d+did, h, w+wid]) + \
+                    h1lambda * (w2lambda * input[:, :, d+did, h+hid, w] + \
+                              w1lambda * input[:, :, d+did, h+hid, w+wid]))
+    if data_layout == "NDHWC":
+        out = np.transpose(out, (0, 2, 3, 4, 1))  # NCDHW => NDHWC
+
+    return out.astype(input.dtype)  # cast the float accumulator back to the input dtype
+
+
+class TestTrilinearInterpOp(OpTest):  # base case; subclasses override init_test_case only
+    def setUp(self):
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCDHW'
+        self.init_test_case()  # may overwrite the three defaults set above
+        self.op_type = "trilinear_interp_v2"
+        input_np = np.random.random(self.input_shape).astype("float32")
+
+        if self.data_layout == "NCDHW":
+            in_d = self.input_shape[2]
+            in_h = self.input_shape[3]
+            in_w = self.input_shape[4]
+        else:
+            in_d = self.input_shape[1]
+            in_h = self.input_shape[2]
+            in_w = self.input_shape[3]
+
+        if isinstance(self.scale, list) or self.scale > 0:  # list > 0 raises TypeError on Python 3
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                scale_d = scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_d = scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[2]
+                scale_h = self.scale[1]
+                scale_d = self.scale[0]
+            out_d = int(in_d * scale_d)
+            out_h = int(in_h * scale_h)
+            out_w = int(in_w * scale_w)
+        else:  # no usable scale: take the sizes from the attributes
+            out_d = self.out_d
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = trilinear_interp_np(
+            input_np, out_d, out_h, out_w, self.out_size, self.actual_shape,
+            self.align_corners, self.align_mode, self.data_layout)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        if self.actual_shape is not None:  # overwrites the OutSize entry set from out_size
+            self.inputs['OutSize'] = self.actual_shape
+        # The C++ side treats NCDHW the same way as NCHW.
+        if self.data_layout == 'NCDHW':
+            data_layout = 'NCHW'
+        else:
+            data_layout = 'NHWC'
+        self.attrs = {
+            'out_d': self.out_d,
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode,
+            'data_layout': data_layout
+        }
+        if isinstance(self.scale, list) or self.scale > 0:  # same list-safe guard as above
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0], self.scale[0]]  # one factor per spatial dim
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):  # defaults for the base case
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 4, 4, 4]
+        self.out_d = 2
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.  # not > 0, so setUp takes sizes from out_d/out_h/out_w
+        self.out_size = np.array([3, 3, 3]).astype("int32")  # the reference output uses this size, not 2x2x2
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase1(TestTrilinearInterpOp):  # every spatial dim resized to 1
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 1, 7, 8, 9]
+        self.out_d = 1
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.  # not > 0, so the out_* sizes above are used
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase2(TestTrilinearInterpOp):  # 9x6x8 input enlarged to 12x12x12
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 9, 6, 8]
+        self.out_d = 12
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.  # not > 0, so the out_* sizes above are used
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase3(TestTrilinearInterpOp):  # each spatial dim doubled: 16x8x4 -> 32x16x8
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [3, 2, 16, 8, 4]
+        self.out_d = 32
+        self.out_h = 16
+        self.out_w = 8
+        self.scale = 0.  # not > 0, so the out_* sizes above are used
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase4(TestTrilinearInterpOp):  # out_size differs from the out_* attributes
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [4, 1, 7, 8, 9]
+        self.out_d = 1
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.out_size = np.array([2, 2, 2]).astype("int32")  # fed as OutSize; the reference output is 2x2x2
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase5(TestTrilinearInterpOp):  # out_size differs from the out_* attributes
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [3, 3, 9, 6, 8]
+        self.out_d = 12
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.out_size = np.array([11, 11, 11]).astype("int32")  # fed as OutSize; the reference output is 11x11x11
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase6(TestTrilinearInterpOp):  # out_size differs from the out_* attributes
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [1, 1, 16, 8, 4]
+        self.out_d = 8
+        self.out_h = 32
+        self.out_w = 16
+        self.scale = 0.
+        self.out_size = np.array([17, 9, 5]).astype("int32")  # fed as OutSize; the reference output is 17x9x5
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpSame(TestTrilinearInterpOp):  # output size equals the input's 16x8x4
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [1, 1, 16, 8, 4]
+        self.out_d = 16
+        self.out_h = 8
+        self.out_w = 4
+        self.scale = 0.  # not > 0, so the out_* sizes above are used
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpSameHW(TestTrilinearInterpOp):  # depth 16 -> 8; height and width unchanged
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [1, 1, 16, 8, 4]
+        self.out_d = 8
+        self.out_h = 8
+        self.out_w = 4
+        self.scale = 0.  # not > 0, so the out_* sizes above are used
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpActualShape(TestTrilinearInterpOp):  # NOTE(review): sets out_size, not actual_shape, despite the name
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [3, 2, 16, 8, 4]
+        self.out_d = 64
+        self.out_h = 32
+        self.out_w = 16
+        self.scale = 0.
+        self.out_size = np.array([33, 19, 7]).astype("int32")  # fed as OutSize; the reference output is 33x19x7
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpDatalayout(TestTrilinearInterpOp):  # channels-last (NDHWC) variant
+    def init_test_case(self):
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 4, 4, 4, 3]  # read as N, D, H, W, C because data_layout is NDHWC
+        self.out_d = 2
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3, 3]).astype("int32")  # fed as OutSize; the reference output is 3x3x3
+        self.align_corners = True
+        self.align_mode = 1
+        self.data_layout = "NDHWC"
+
+
+class TestTrilinearInterpOpUint8(OpTest):
+    def setUp(self):  # builds the uint8 input, op attrs and NumPy reference
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "trilinear_interp_v2"
+        input_np = np.random.randint(
+            low=0, high=256, size=self.input_shape).astype("uint8")
+
+        if self.scale > 0:  # positive scale: derive output D/H/W from input
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                scale_d = scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_d = scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[2]  # list layout is [d, h, w]
+                scale_h = self.scale[1]
+                scale_d = self.scale[0]
+            out_d = int(self.input_shape[2] * scale_d)
+            out_h = int(self.input_shape[3] * scale_h)
+            out_w = int(self.input_shape[4] * scale_w)
+        else:  # otherwise use the sizes configured by init_test_case
+            out_d = self.out_d
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = trilinear_interp_np(input_np, out_d, out_h, out_w,
+                                        self.out_size, self.actual_shape,
+                                        self.align_corners, self.align_mode)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:  # OutSize is only fed when configured
+            self.inputs['OutSize'] = self.out_size
+
+        self.attrs = {
+            'out_d': self.out_d,
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode
+        }
+        if self.scale > 0:  # scalar / 1-item scale is expanded to [s, s, s]
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):  # output is checked on CPUPlace with atol=1
+        self.check_output_with_place(place=core.CPUPlace(), atol=1)
+
+    def init_test_case(self):  # default config; subclasses override this
+        self.interp_method = 'trilinear'
+        self.input_shape = [1, 3, 9, 6, 8]
+        self.out_d = 13
+        self.out_h = 10
+        self.out_w = 9
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase1Uint8(TestTrilinearInterpOpUint8):
+    def init_test_case(self):  # 16x8x4 -> 13x7x2, no out_size
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 16, 8, 4]
+        self.out_d = 13
+        self.out_h = 7
+        self.out_w = 2
+        self.scale = 0.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpCase2Uint8(TestTrilinearInterpOpUint8):
+    def init_test_case(self):  # out_size [6, 15, 21] is fed as OutSize
+        self.interp_method = 'trilinear'
+        self.input_shape = [4, 1, 7, 8, 9]
+        self.out_d = 3
+        self.out_h = 5
+        self.out_w = 13
+        self.scale = 0.
+        self.out_size = np.array([6, 15, 21]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpOtherMethod1(TestTrilinearInterpOp):
+    def set_align_mode(self):  # NOTE(review): no caller visible - confirm
+        self.align_corners = False
+        self.align_mode = 1
+
+
+class TestTrilinearInterpWithMethod2(TestTrilinearInterpOp):
+    def set_align_mode(self):  # NOTE(review): no caller visible - confirm
+        self.align_corners = False
+        self.align_mode = 0
+
+
+class TestTrilinearInterpWithMethod3(TestTrilinearInterpOp):
+    def set_align_mode(self):  # NOTE(review): no caller visible - confirm
+        self.align_corners = True
+        self.align_mode = 0
+
+
+class TestTrilinearInterpScale1(TestTrilinearInterpOp):
+    def init_test_case(self):  # D/H/W * 2. = 10/14/18; attrs say 82/60/25
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 5, 7, 9]
+        self.out_d = 82
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 2.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpScale2(TestTrilinearInterpOp):
+    def init_test_case(self):  # D/H/W * 1. = 5/7/9; attrs say 60/40/25
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 5, 7, 9]
+        self.out_d = 60
+        self.out_h = 40
+        self.out_w = 25
+        self.scale = 1.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpScale3(TestTrilinearInterpOp):
+    def init_test_case(self):  # int(D/H/W * 1.5) = 7/10/13; attrs 60/40/25
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 5, 7, 9]
+        self.out_d = 60
+        self.out_h = 40
+        self.out_w = 25
+        self.scale = 1.5
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestTrilinearInterpZero(TestTrilinearInterpOp):
+    def init_test_case(self):  # int(D/H/W * 0.2) = 1/1/2; attrs 60/40/25
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 5, 7, 11]
+        self.out_d = 60
+        self.out_h = 40
+        self.out_w = 25
+        self.scale = 0.2
+        self.align_corners = False
+        self.align_mode = 0
+
+
+class TestTrilinearInterpOp_attr_tensor(OpTest):
+    def setUp(self):  # builds inputs (incl. tensor-valued size/scale), attrs
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()  # NOTE(review): runs before the flag defaults
+        self.op_type = "trilinear_interp_v2"
+        self.shape_by_1Dtensor = False  # overwrites any subclass value
+        self.scale_by_1Dtensor = False  # overwrites any subclass value
+        self.attrs = {
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode
+        }
+
+        input_np = np.random.random(self.input_shape).astype("float32")
+        self.inputs = {'X': input_np}
+
+        if self.scale_by_1Dtensor:  # NOTE: out_d/h/w stay unset here
+            self.inputs['Scale'] = np.array([self.scale]).astype("float32")
+        elif self.scale > 0:  # positive scale: derive output D/H/W from input
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                scale_d = scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_d = scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[2]  # list layout is [d, h, w]
+                scale_h = self.scale[1]
+                scale_d = self.scale[0]
+            out_d = int(self.input_shape[2] * scale_d)
+            out_h = int(self.input_shape[3] * scale_h)
+            out_w = int(self.input_shape[4] * scale_w)
+        else:  # otherwise use the sizes configured by init_test_case
+            out_d = self.out_d
+            out_h = self.out_h
+            out_w = self.out_w
+
+        if self.shape_by_1Dtensor:  # whole out_size as a single input
+            self.inputs['OutSize'] = self.out_size
+        elif self.out_size is not None:  # one 1-element int32 array per dim
+            size_tensor = []
+            for index, ele in enumerate(self.out_size):
+                size_tensor.append(("x" + str(index), np.ones(
+                    (1)).astype('int32') * ele))
+            self.inputs['SizeTensor'] = size_tensor
+
+        self.attrs['out_d'] = self.out_d
+        self.attrs['out_h'] = self.out_h
+        self.attrs['out_w'] = self.out_w
+        if self.scale > 0:  # scalar / 1-item scale is expanded to [s, s, s]
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        output_np = trilinear_interp_np(input_np, out_d, out_h, out_w,
+                                        self.out_size, self.actual_shape,
+                                        self.align_corners, self.align_mode)
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):  # forward check with OpTest defaults
+        self.check_output()
+
+    def test_check_grad(self):  # gradient of Out w.r.t. X
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):  # default config; subclasses override this
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 4, 4, 4]
+        self.out_d = 2
+        self.out_h = 3
+        self.out_w = 3
+        self.scale = 0.
+        self.out_size = [2, 3, 3]
+        self.align_corners = True
+        self.align_mode = 1
+
+
+# out_size is a list; setUp feeds each element as a SizeTensor input
+class TestTrilinearInterp_attr_tensor_Case1(TestTrilinearInterpOp_attr_tensor):
+    def init_test_case(self):  # sets both scale (0.3) and an out_size list
+        self.interp_method = 'trilinear'
+        self.input_shape = [3, 2, 9, 6, 8]
+        self.out_d = 32
+        self.out_h = 16
+        self.out_w = 8
+        self.scale = 0.3
+        self.out_size = [12, 4, 4]
+        self.align_corners = True
+        self.align_mode = 1
+
+
+# out_size is a 1-D tensor
+class TestTrilinearInterp_attr_tensor_Case2(TestTrilinearInterpOp_attr_tensor):
+    def init_test_case(self):  # intended to feed out_size as one OutSize
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 8, 8, 4]
+        self.out_d = 16
+        self.out_h = 12
+        self.out_w = 4
+        self.scale = 0.
+        self.out_size = [16, 4, 10]
+        self.align_corners = True
+        self.align_mode = 1
+        self.shape_by_1Dtensor = True  # NOTE(review): reset to False by setUp
+
+
+# scale is a 1-D tensor
+class TestTrilinearInterp_attr_tensor_Case3(TestTrilinearInterpOp_attr_tensor):
+    def init_test_case(self):  # intended to feed scale as a Scale input
+        self.interp_method = 'trilinear'
+        self.input_shape = [2, 3, 8, 8, 4]
+        self.out_d = 16
+        self.out_h = 16
+        self.out_w = 8
+        self.scale = 2.0
+        self.out_size = None
+        self.align_corners = True
+        self.align_mode = 1
+        self.scale_by_1Dtensor = True  # NOTE(review): reset to False by setUp
+
+
+class TestTrilinearInterpAPI(unittest.TestCase):
+    def test_case(self):  # static-graph API calls checked against NumPy
+        x = fluid.data(name="x", shape=[2, 3, 6, 9, 4], dtype="float32")
+        y = fluid.data(name="y", shape=[2, 6, 9, 4, 3], dtype="float32")
+
+        dim = fluid.data(name="dim", shape=[1], dtype="int32")
+        shape_tensor = fluid.data(name="shape_tensor", shape=[3], dtype="int32")
+        actual_size = fluid.data(name="actual_size", shape=[3], dtype="int32")
+        scale_tensor = fluid.data(
+            name="scale_tensor", shape=[1], dtype="float32")
+
+        out1 = fluid.layers.resize_trilinear(
+            y, out_shape=[12, 18, 8], data_format='NDHWC')
+        out2 = fluid.layers.resize_trilinear(x, out_shape=[12, dim, 8])
+        out3 = fluid.layers.resize_trilinear(x, out_shape=shape_tensor)
+        out4 = fluid.layers.resize_trilinear(
+            x, out_shape=[4, 4, 8], actual_shape=actual_size)
+        out5 = fluid.layers.resize_trilinear(x, scale=scale_tensor)
+        out6 = interpolate(
+            x, scale_factor=scale_tensor, mode='trilinear', data_format="NCDHW")
+        out7 = interpolate(
+            x, size=[4, 4, 8], mode='trilinear', data_format="NCDHW")
+        out8 = interpolate(
+            x, size=shape_tensor, mode='trilinear', data_format="NCDHW")
+
+        x_data = np.random.random((2, 3, 6, 9, 4)).astype("float32")
+        dim_data = np.array([18]).astype("int32")
+        shape_data = np.array([12, 18, 8]).astype("int32")
+        actual_size_data = np.array([12, 18, 8]).astype("int32")
+        scale_data = np.array([2.0]).astype("float32")
+
+        if core.is_compiled_with_cuda():  # use the GPU when available
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        results = exe.run(fluid.default_main_program(),
+                          feed={
+                              "x": x_data,
+                              "y": np.transpose(x_data, (0, 2, 3, 4, 1)),
+                              "dim": dim_data,
+                              "shape_tensor": shape_data,
+                              "actual_size": actual_size_data,
+                              "scale_tensor": scale_data
+                          },
+                          fetch_list=[out1, out2, out3, out4, out5],  # NOTE(review): out6-out8 are never fetched or checked
+                          return_numpy=True)
+
+        expect_res = trilinear_interp_np(
+            x_data, out_d=12, out_h=18, out_w=8, align_mode=1)
+        self.assertTrue(  # out1 is NDHWC, so the reference is transposed
+            np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 4, 1))))
+        for i in range(len(results) - 1):  # out2..out5 share one reference
+            self.assertTrue(np.allclose(results[i + 1], expect_res))
+
+
+class TestTrilinearInterpOpException(unittest.TestCase):
+    def test_exception(self):
+        """resize_trilinear must raise ValueError for data_format='NHWC'."""
+        data_5d = fluid.data(name="input", shape=[2, 3, 6, 9, 4], dtype="float32")
+
+        # A 5-D input only supports the NCDHW and NDHWC layouts, so the
+        # 4-D layout string below has to be rejected.
+        with self.assertRaises(ValueError):
+            fluid.layers.resize_trilinear(
+                data_5d, out_shape=[4, 8, 4], data_format='NHWC')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
index 158462a1e6e1012b7473a2410f2c003d04ea2e40..a04aaaef0d41b9f991889586b489269b6ede5b42 100644
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
@@ -21,6 +21,7 @@ import numpy as np
 from op_test import OpTest
 import paddle
 import paddle.fluid.core as core
+import paddle
 from paddle.fluid.op import Operator
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
@@ -234,16 +235,16 @@ class TestUniformRandomOpSelectedRows(unittest.TestCase):
     def check_with_place(self, place):
         scope = core.Scope()
         out = scope.var("X").get_selected_rows()
-
+        paddle.manual_seed(10)
         op = Operator(
             "uniform_random",
             Out="X",
-            shape=[4, 784],
+            shape=[100, 784],
             min=-5.0,
             max=10.0,
             seed=10)
         op.run(scope, place)
-        self.assertEqual(out.get_tensor().shape(), [4, 784])
+        self.assertEqual(out.get_tensor().shape(), [100, 784])
         hist, prob = output_hist(np.array(out.get_tensor()))
         self.assertTrue(
             np.allclose(
@@ -255,19 +256,19 @@ class TestUniformRandomOpSelectedRowsWithDiagInit(
     def check_with_place(self, place):
         scope = core.Scope()
         out = scope.var("X").get_selected_rows()
-
+        paddle.manual_seed(10)
         op = Operator(
             "uniform_random",
             Out="X",
-            shape=[4, 784],
+            shape=[100, 784],
             min=-5.0,
             max=10.0,
             seed=10,
-            diag_num=4,
+            diag_num=100,
             diag_step=784,
             diag_val=1.0)
         op.run(scope, place)
-        self.assertEqual(out.get_tensor().shape(), [4, 784])
+        self.assertEqual(out.get_tensor().shape(), [100, 784])
         hist, prob = output_hist_diag(np.array(out.get_tensor()))
         self.assertTrue(
             np.allclose(
@@ -276,6 +277,7 @@ class TestUniformRandomOpSelectedRowsWithDiagInit(
 
 class TestUniformRandomOpApi(unittest.TestCase):
     def test_api(self):
+        paddle.manual_seed(10)
         x = fluid.layers.data('x', shape=[16], dtype='float32', lod_level=1)
         y = fluid.layers.fc(x,
                             size=16,
@@ -347,12 +349,15 @@ class TestUniformRandomOp_attr_tensor_API(unittest.TestCase):
 
 class TestUniformRandomOp_API_seed(unittest.TestCase):
     def test_attr_tensor_API(self):
+        _seed = 10
+        gen = paddle.manual_seed(_seed)
+        gen._is_init_py = False
         startup_program = fluid.Program()
         train_program = fluid.Program()
         with fluid.program_guard(train_program, startup_program):
             _min = 5
             _max = 10
-            _seed = 10
+
             ret = fluid.layers.nn.uniform_random(
                 [2, 3, 2], min=_min, max=_max, seed=_seed)
             ret_2 = fluid.layers.nn.uniform_random(
@@ -386,8 +391,8 @@ class TestUniformRandomOpSelectedRowsShapeTensor(unittest.TestCase):
         scope = core.Scope()
         out = scope.var("X").get_selected_rows()
         shape_tensor = scope.var("Shape").get_tensor()
-        shape_tensor.set(np.array([4, 784]).astype("int64"), place)
-
+        shape_tensor.set(np.array([100, 784]).astype("int64"), place)
+        paddle.manual_seed(10)
         op = Operator(
             "uniform_random",
             ShapeTensor="Shape",
@@ -396,7 +401,7 @@ class TestUniformRandomOpSelectedRowsShapeTensor(unittest.TestCase):
             max=10.0,
             seed=10)
         op.run(scope, place)
-        self.assertEqual(out.get_tensor().shape(), [4, 784])
+        self.assertEqual(out.get_tensor().shape(), [100, 784])
         hist, prob = output_hist(np.array(out.get_tensor()))
         self.assertTrue(
             np.allclose(
@@ -418,10 +423,10 @@ class TestUniformRandomOpSelectedRowsShapeTensorList(unittest.TestCase):
         scope = core.Scope()
         out = scope.var("X").get_selected_rows()
         shape_1 = scope.var("shape1").get_tensor()
-        shape_1.set(np.array([4]).astype("int64"), place)
+        shape_1.set(np.array([100]).astype("int64"), place)
         shape_2 = scope.var("shape2").get_tensor()
         shape_2.set(np.array([784]).astype("int64"), place)
-
+        paddle.manual_seed(10)
         op = Operator(
             "uniform_random",
             ShapeTensorList=["shape1", "shape2"],
@@ -430,7 +435,7 @@ class TestUniformRandomOpSelectedRowsShapeTensorList(unittest.TestCase):
             max=10.0,
             seed=10)
         op.run(scope, place)
-        self.assertEqual(out.get_tensor().shape(), [4, 784])
+        self.assertEqual(out.get_tensor().shape(), [100, 784])
         hist, prob = output_hist(np.array(out.get_tensor()))
         self.assertTrue(
             np.allclose(
@@ -455,21 +460,21 @@ class TestUniformRandomBatchSizeLikeOpError(unittest.TestCase):
 
             def test_Variable():
                 x1 = fluid.create_lod_tensor(
-                    np.zeros((4, 784)), [[1, 1, 1, 1]], fluid.CPUPlace())
+                    np.zeros((100, 784)), [[10, 10, 10, 70]], fluid.CPUPlace())
                 fluid.layers.uniform_random_batch_size_like(x1)
 
             self.assertRaises(TypeError, test_Variable)
 
             def test_shape():
                 x1 = fluid.layers.data(
-                    name='x2', shape=[4, 784], dtype='float32')
+                    name='x2', shape=[100, 784], dtype='float32')
                 fluid.layers.uniform_random_batch_size_like(x1, shape="shape")
 
             self.assertRaises(TypeError, test_shape)
 
             def test_dtype():
                 x2 = fluid.layers.data(
-                    name='x2', shape=[4, 784], dtype='float32')
+                    name='x2', shape=[100, 784], dtype='float32')
                 fluid.layers.uniform_random_batch_size_like(x2, 'int32')
 
             self.assertRaises(TypeError, test_dtype)
@@ -495,20 +500,20 @@ class TestUniformOpError(unittest.TestCase):
 
             def test_Variable():
                 x1 = fluid.create_lod_tensor(
-                    np.zeros((4, 784)), [[1, 1, 1, 1]], fluid.CPUPlace())
+                    np.zeros((100, 784)), [[10, 10, 10, 70]], fluid.CPUPlace())
                 paddle.tensor.random.uniform(x1)
 
             self.assertRaises(TypeError, test_Variable)
 
             def test_Variable2():
-                x1 = np.zeros((4, 784))
+                x1 = np.zeros((100, 784))
                 paddle.tensor.random.uniform(x1)
 
             self.assertRaises(TypeError, test_Variable2)
 
             def test_dtype():
                 x2 = fluid.layers.data(
-                    name='x2', shape=[4, 784], dtype='float32')
+                    name='x2', shape=[100, 784], dtype='float32')
                 paddle.tensor.random.uniform(x2, 'int32')
 
             self.assertRaises(TypeError, test_dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_unique.py b/python/paddle/fluid/tests/unittests/test_unique.py
index ae36f8a9861560b51a78617d1b66aa7743b87580..a2c60d870e5e13fd161945fe0abe9b3ab82cc82c 100644
--- a/python/paddle/fluid/tests/unittests/test_unique.py
+++ b/python/paddle/fluid/tests/unittests/test_unique.py
@@ -233,6 +233,24 @@ class TestUniqueAPI(unittest.TestCase):
         self.assertTrue((counts.numpy() == np_counts).all(), True)
         paddle.enable_static()
 
+    def test_dygraph_attr_dtype(self):
+        """paddle.unique(dtype="int32") in dygraph must match np.unique."""
+        paddle.disable_static()
+        x_data = np.random.randint(0, 10, (120))
+        x = paddle.to_tensor(x_data)
+        # Request every optional output, passing dtype="int32" explicitly.
+        out, indices, inverse, counts = paddle.unique(
+            x, return_index=True, return_inverse=True, return_counts=True,
+            dtype="int32")
+        # NumPy reference for all four outputs.
+        expected_out, np_indices, np_inverse, np_counts = np.unique(
+            x_data, return_index=True, return_inverse=True, return_counts=True)
+        self.assertTrue((out.numpy() == expected_out).all(), True)
+        self.assertTrue((indices.numpy() == np_indices).all(), True)
+        self.assertTrue((inverse.numpy() == np_inverse).all(), True)
+        self.assertTrue((counts.numpy() == np_counts).all(), True)
+        paddle.enable_static()
+
     def test_static_graph(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
@@ -282,6 +300,11 @@ class TestUniqueError(unittest.TestCase):
         def test_axis():
             result = paddle.unique(x, axis='12')
 
+        # An unsupported dtype string is expected to be rejected as well.
+        def test_dtype():
+            result = paddle.unique(x, dtype='float64')
+
         self.assertRaises(TypeError, test_axis)
+        self.assertRaises(TypeError, test_dtype)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index 80b94704c388824901312b5d577cb5cfd0d0c75b..c8383bb950d3ed7b2cdfafa185b0ad156bf7c7bf 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -32,6 +32,30 @@ class TestVarBase(unittest.TestCase):
     def test_to_tensor(self):
         def _test_place(place):
             with fluid.dygraph.guard():
+                paddle.set_default_dtype('float32')
+                x = paddle.to_tensor(1, place=place, stop_gradient=False)
+                self.assertTrue(np.array_equal(x.numpy(), [1]))
+                self.assertNotEqual(x.dtype, core.VarDesc.VarType.FP32)
+
+                x = paddle.to_tensor(1.2, place=place, stop_gradient=False)
+                self.assertTrue(
+                    np.array_equal(x.numpy(), np.array([1.2]).astype(
+                        'float32')))
+                self.assertEqual(x.dtype, core.VarDesc.VarType.FP32)
+
+                x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False)
+                self.assertTrue(np.array_equal(x.numpy(), [1 + 2j]))
+                self.assertEqual(x.dtype, 'complex64')
+
+                paddle.set_default_dtype('float64')
+                x = paddle.to_tensor(1.2, place=place, stop_gradient=False)
+                self.assertTrue(np.array_equal(x.numpy(), [1.2]))
+                self.assertEqual(x.dtype, core.VarDesc.VarType.FP64)
+
+                x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False)
+                self.assertTrue(np.array_equal(x.numpy(), [1 + 2j]))
+                self.assertEqual(x.dtype, 'complex128')
+
                 x = paddle.to_tensor(
                     1, dtype='float32', place=place, stop_gradient=False)
                 self.assertTrue(np.array_equal(x.numpy(), [1.]))
diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py
index 207ff66a0f877598989e47a8632aa783b53bcc67..ee01bfb21f8206133aa55e8962b4bcc46233085e 100644
--- a/python/paddle/fluid/tests/unittests/test_while_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_op.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import unittest
+import paddle
 import paddle.fluid.layers as layers
 from paddle.fluid.executor import Executor
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py
index 4629089e39c9489725340df2172c53ed0661708f..581656f6cd421b12cb4c373bd6d46648704f0c1a 100644
--- a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py
@@ -73,6 +73,7 @@ NO_FP64_CHECK_GRAD_OP_LIST = [
     'mish', \
     'transpose2', \
     'trilinear_interp', \
+    'trilinear_interp_v2', \
     'var_conv_2d', \
     'warpctc', \
     'bilateral_slice'
diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
index 5300ab935a3405f9f76c08a7f2ece8bad33367ac..47d62999c92d12ab4305272f60c1453cda211b09 100644
--- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
@@ -15,6 +15,7 @@
 NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [
     'affine_channel', \
     'bilinear_interp', \
+    'bilinear_interp_v2',\
     'bilinear_tensor_product', \
     'conv2d', \
     'conv3d', \
@@ -45,4 +46,6 @@ NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [
     'cudnn_lstm'
 ]
 
-NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp']
+NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp',\
+                                                'bilinear_interp_v2'
+                                                ]
diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py
index f01dc01973a603a0b6ea08358f73237c68924c78..b2975283fbef010029b935b9b209411f09bdb5fd 100644
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
@@ -20,8 +20,8 @@ __all__ = [
 ]
 
 __all__ += [
-    'BackwardStrategy', 'grad', 'LayerList', 'load', 'save', 'prepare_context',
-    'to_variable', 'no_grad', 'ParallelEnv', 'DataParallel'
+    'grad', 'LayerList', 'load', 'save', 'prepare_context', 'to_variable',
+    'no_grad', 'ParallelEnv', 'DataParallel'
 ]
 
 __all__ += [
@@ -50,8 +50,6 @@ from ..fluid.dygraph.base import to_variable  #DEFINE_ALIAS
 from ..fluid.dygraph.base import grad  #DEFINE_ALIAS
 from ..fluid.dygraph.checkpoint import load_dygraph as load  #DEFINE_ALIAS
 from ..fluid.dygraph.checkpoint import save_dygraph as save  #DEFINE_ALIAS
-from ..fluid.dygraph.parallel import prepare_context  #DEFINE_ALIAS
-from ..fluid.dygraph.parallel import ParallelEnv  #DEFINE_ALIAS
 from ..fluid.dygraph.parallel import DataParallel  #DEFINE_ALIAS
 
 from ..fluid.dygraph.learning_rate_scheduler import NoamDecay  #DEFINE_ALIAS
@@ -61,5 +59,3 @@ from ..fluid.dygraph.learning_rate_scheduler import ExponentialDecay  #DEFINE_AL
 from ..fluid.dygraph.learning_rate_scheduler import InverseTimeDecay  #DEFINE_ALIAS
 from ..fluid.dygraph.learning_rate_scheduler import PolynomialDecay  #DEFINE_ALIAS
 from ..fluid.dygraph.learning_rate_scheduler import CosineDecay  #DEFINE_ALIAS
-
-BackwardStrategy = core.BackwardStrategy
diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py
index 1bb13294805efca319546b432354ce5ba6b1e868..2555d24464112ed8446d863dc8e65cfa37680b36 100644
--- a/python/paddle/framework/random.py
+++ b/python/paddle/framework/random.py
@@ -14,28 +14,50 @@
 
 # TODO: define random api
 import paddle.fluid as fluid
+from paddle.fluid import core
 
 __all__ = ['manual_seed']
 
 
 def manual_seed(seed):
     """
-	:alias_main: paddle.manual_seed
-	:alias: paddle.manual_seed,paddle.framework.random.manual_seed
 
-    Set global manual seed for program
+    Sets the seed for global default generator, which manages the random number generation.
 
     Args:
-        manual_seed(int): random seed for program
+        seed(int): The random seed to set. It is recommended to set a large int number.
 
     Returns:
-        None.
+        Generator: The global default generator object.
 
     Examples:
         .. code-block:: python
 
-            from paddle.framework import manual_seed
-            manual_seed(102)
+            import paddle
+            gen = paddle.manual_seed(102)
+
+    """
+    #TODO(zhiqiu): 1. remove program.random_seed when all random-related op upgrade
+    # 2. support gpu generator by global device 
+
+    seed = int(seed)
+
+    core.default_cpu_generator()._is_init_py = True
+    return core.default_cpu_generator().manual_seed(seed)
+
+
+def _manual_program_seed(seed):
+    """
+    Sets global seed for generating random numbers.
+  
+    NOTE(zhiqiu): This is the original implementation of manual_seed. It is kept temporarily
+    because the CUDA generator is not developed yet, so the unittests still need it.
+
+    Args:
+        seed(int): The random seed to set. It is recommended to set a large int number.
+    
+    Returns:
+        None
     """
     fluid.default_main_program().random_seed = seed
     fluid.default_startup_program().random_seed = seed
diff --git a/python/paddle/incubate/hapi/model.py b/python/paddle/incubate/hapi/model.py
index 977f9233a954746486afee114b2e87a25096ed38..b52354d4ccf4671b0d372bae63a1befbe383e053 100644
--- a/python/paddle/incubate/hapi/model.py
+++ b/python/paddle/incubate/hapi/model.py
@@ -891,33 +891,31 @@ class Model(object):
 
                 class Mnist(paddle.nn.Layer):
                     def __init__(self):
-                        super(MyNet, self).__init__()
-                        self._fc = Linear(784, 1, act='softmax')
+                        super(Mnist, self).__init__()
+                        self._fc = Linear(784, 10, act='softmax')
 
-                  @paddle.jit.to_static # If save for inference in dygraph, need this
-                  def forward(self, x):
-                      y = self._fc(x)
-                      return y
+                    # If save for inference in dygraph, need this
+                    @paddle.jit.to_static
+                    def forward(self, x):
+                        y = self._fc(x)
+                        return y
 
-                dynamic = True # False
+                dynamic = True  # False
                 device = hapi.set_device('cpu')
                 # if use static graph, do not set
                 paddle.disable_static(device) if dynamic else None
-
                 # inputs and labels are not required for dynamic graph.
                 input = hapi.Input([None, 784], 'float32', 'x')
                 label = hapi.Input([None, 1], 'int64', 'label')
-
                 model = hapi.Model(Mnist(), input, label)
                 optim = paddle.optimizer.SGD(learning_rate=1e-3,
-                    parameter_list=model.parameters())
-                model.prepare(optim,
-                                paddle.nn.CrossEntropyLoss(),
-                                hapi.metrics.Accuracy())
+                                            parameter_list=model.parameters())
+                model.prepare(optim, paddle.nn.CrossEntropyLoss())
                 mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
                 model.fit(mnist_data, epochs=1, batch_size=32, verbose=0)
-                model.save('checkpoint/test') # save for training
-                model.save('inference_model', False) # save for inference
+                model.save('checkpoint/test')  # save for training
+                model.save('inference_model', False)  # save for inference
+
         """
 
         if ParallelEnv().local_rank == 0:
@@ -1534,47 +1532,6 @@ class Model(object):
 
         Returns:
             list: The fetch variables' name list
-
-        Examples:
-        .. code-block:: python
-            import numpy as np
-            import paddle
-            from paddle.static import InputSpec
-
-            import paddle.incubate.hapi as hapi
-            from paddle.nn import Linear
-            from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset
-
-            class Mnist(Layer):
-                def __init__(self, classifier_act=None):
-                    super(Mnist, self).__init__()
-
-                    self.fc = Linear(input_dim=784, output_dim=10, act="softmax")
-
-                @paddle.jit.to_static # In static mode, you need to delete this.
-                def forward(self, inputs):
-                    outputs = self.fc(inputs)
-                    return outputs
-
-            dynamic = True # False
-            device = hapi.set_device('gpu')
-
-            # if use static graph, do not set
-            paddle.disable_static(device) if dynamic else None
-
-            # inputs and labels are not required for dynamic graph.
-            input = InputSpec([None, 784], 'float32', 'x')
-            label = InputSpec([None, 1], 'int64', 'label')
-
-            model = hapi.Model(Mnist(), input, label)
-            optim = paddle.optimizer.SGD(learning_rate=1e-3,
-                parameter_list=model.parameters())
-            model.prepare(optim,
-                            paddle.nn.CrossEntropyLoss(),
-                            hapi.metrics.Accuracy())
-            mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
-            model.fit(mnist_data, epochs=1, batch_size=32, verbose=0)
-            model.save_inference_model('inference_model')
         """
 
         def get_inout_spec(all_vars, return_name=False):
@@ -1592,68 +1549,66 @@ class Model(object):
         #    the inputs of the model in running.
         # 3. Make it Unnecessary to add `@paddle.jit.to_static` for users in dynamic mode.
         if fluid.in_dygraph_mode():
-            layer = self.network
-            fluid.disable_dygraph()
-
-            # 1. input check
-            prog_translator = ProgramTranslator()
-            if not prog_translator.enable_declarative:
-                raise RuntimeError(
-                    "save_inference_model doesn't work when setting ProgramTranslator.enable=False."
-                )
-            if not isinstance(layer, Layer):
-                raise TypeError(
-                    "The input layer should be 'Layer', but received layer type is %s."
-                    % type(layer))
-
-            # 2. get program of declarative Layer.forward
-            prog_cache = prog_translator.get_program_cache()
-            # make dummy args & kwargs, to get excepted FunctionSpec
-            layer_func = FunctionSpec(type(layer).forward, [layer], {})
-            concrete_program, _ = prog_cache.get_program(layer_func)
-
-            # NOTE: we maintain the mapping of variable name to
-            # structured name, the buffer variable (non-persistable)
-            # saved to inference program may not need by dygraph Layer,
-            # we only record the state_dict variable's structured name
-            state_names_dict = dict()
-            for structured_name, var in layer.state_dict().items():
-                state_names_dict[var.name] = structured_name
-
-            # 3. share parameters from Layer to scope & record var info
-            scope = core.Scope()
-            extra_var_info = dict()
-            for param_or_buffer in concrete_program.parameters:
-                # share to scope
-                param_or_buffer_tensor = scope.var(
-                    param_or_buffer.name).get_tensor()
-                src_tensor = param_or_buffer.value().get_tensor()
-                param_or_buffer_tensor._share_data_with(src_tensor)
-                # record var info
-                extra_info_dict = dict()
-                if param_or_buffer.name in state_names_dict:
-                    extra_info_dict['structured_name'] = state_names_dict[
-                        param_or_buffer.name]
-                extra_info_dict['stop_gradient'] = param_or_buffer.stop_gradient
-                if isinstance(param_or_buffer, ParamBase):
-                    extra_info_dict['trainable'] = param_or_buffer.trainable
-                extra_var_info[param_or_buffer.name] = extra_info_dict
-
-            # 4. build input & output spec
-            input_var_names = get_inout_spec(concrete_program.inputs, True)
-            output_vars = get_inout_spec(concrete_program.outputs)
-
-            # 5. save inference model
-            with scope_guard(scope):
-                return fluid.io.save_inference_model(
-                    dirname=save_dir,
-                    feeded_var_names=input_var_names,
-                    target_vars=output_vars,
-                    executor=Executor(_current_expected_place()),
-                    main_program=concrete_program.main_program.clone(),
-                    model_filename=model_filename,
-                    params_filename=params_filename,
-                    program_only=model_only)
+            with fluid.framework._dygraph_guard(None):
+                layer = self.network
+
+                # 1. input check
+                prog_translator = ProgramTranslator()
+                if not prog_translator.enable_declarative:
+                    raise RuntimeError(
+                        "save_inference_model doesn't work when setting ProgramTranslator.enable=False."
+                    )
+                if not isinstance(layer, Layer):
+                    raise TypeError(
+                        "The input layer should be 'Layer', but received layer type is %s."
+                        % type(layer))
+
+                # 2. get program of declarative Layer.forward
+                concrete_program = layer.forward.concrete_program
+
+                # NOTE: we maintain the mapping of variable name to
+                # structured name, the buffer variable (non-persistable)
+                # saved to inference program may not need by dygraph Layer,
+                # we only record the state_dict variable's structured name
+                state_names_dict = dict()
+                for structured_name, var in layer.state_dict().items():
+                    state_names_dict[var.name] = structured_name
+
+                # 3. share parameters from Layer to scope & record var info
+                scope = core.Scope()
+                extra_var_info = dict()
+                for param_or_buffer in concrete_program.parameters:
+                    # share to scope
+                    param_or_buffer_tensor = scope.var(
+                        param_or_buffer.name).get_tensor()
+                    src_tensor = param_or_buffer.value().get_tensor()
+                    param_or_buffer_tensor._share_data_with(src_tensor)
+                    # record var info
+                    extra_info_dict = dict()
+                    if param_or_buffer.name in state_names_dict:
+                        extra_info_dict['structured_name'] = state_names_dict[
+                            param_or_buffer.name]
+                    extra_info_dict[
+                        'stop_gradient'] = param_or_buffer.stop_gradient
+                    if isinstance(param_or_buffer, ParamBase):
+                        extra_info_dict['trainable'] = param_or_buffer.trainable
+                    extra_var_info[param_or_buffer.name] = extra_info_dict
+
+                # 4. build input & output spec
+                input_var_names = get_inout_spec(concrete_program.inputs, True)
+                output_vars = get_inout_spec(concrete_program.outputs)
+
+                # 5. save inference model
+                with scope_guard(scope):
+                    return fluid.io.save_inference_model(
+                        dirname=save_dir,
+                        feeded_var_names=input_var_names,
+                        target_vars=output_vars,
+                        executor=Executor(_current_expected_place()),
+                        main_program=concrete_program.main_program.clone(),
+                        model_filename=model_filename,
+                        params_filename=params_filename,
+                        program_only=model_only)
 
         else:
             prog = self._adapter._progs.get('test', None)
@@ -1742,12 +1697,13 @@ class Model(object):
         out_specs = []
 
         if specs is None:
-            # If not specific specs of `Input`, using argument names of `forward` function
-            # to generate `Input`.
+            # Note(Aurelius84): If no `Input` specs are given, use the argument names of the `forward` function
+            # to generate `Input`. But how can we know the actual shape of each input tensor?
             if is_input:
                 out_specs = [
-                    Input(name=n) for n in extract_args(self.network.forward)
-                    if n != 'self'
+                    Input(
+                        name=n, shape=[None])
+                    for n in extract_args(self.network.forward) if n != 'self'
                 ]
             else:
                 out_specs = to_list(specs)
diff --git a/python/paddle/incubate/hapi/tests/test_model.py b/python/paddle/incubate/hapi/tests/test_model.py
index 96c432e1bfd8f3620c705be62c4e9d90c2709fa5..7fc471aa1e2eeb80ae81d4a32b09eeff74193e6f 100644
--- a/python/paddle/incubate/hapi/tests/test_model.py
+++ b/python/paddle/incubate/hapi/tests/test_model.py
@@ -22,8 +22,9 @@ import numpy as np
 import shutil
 import tempfile
 
+import paddle
 from paddle import fluid
-from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential
+from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax
 from paddle.fluid.dygraph.base import to_variable
 
 import paddle.incubate.hapi as hapi
@@ -53,10 +54,8 @@ class LeNetDygraph(fluid.dygraph.Layer):
 
         if num_classes > 0:
             self.fc = Sequential(
-                Linear(400, 120),
-                Linear(120, 84),
-                Linear(
-                    84, 10, act=classifier_activation))
+                Linear(400, 120), Linear(120, 84), Linear(84, 10),
+                Softmax())  #Todo: accept any activation
 
     def forward(self, inputs):
         x = self.features(inputs)
@@ -83,10 +82,8 @@ class LeNetDeclarative(fluid.dygraph.Layer):
 
         if num_classes > 0:
             self.fc = Sequential(
-                Linear(400, 120),
-                Linear(120, 84),
-                Linear(
-                    84, 10, act=classifier_activation))
+                Linear(400, 120), Linear(120, 84), Linear(84, 10),
+                Softmax())  #Todo: accept any activation
 
     @declarative
     def forward(self, inputs):
@@ -174,8 +171,8 @@ class TestModel(unittest.TestCase):
             cls.test_dataset, places=cls.device, batch_size=64)
 
         seed = 333
-        fluid.default_startup_program().random_seed = seed
-        fluid.default_main_program().random_seed = seed
+        paddle.manual_seed(seed)
+        paddle.framework.random._manual_program_seed(seed)
 
         dy_lenet = LeNetDygraph()
         cls.init_param = dy_lenet.state_dict()
@@ -226,8 +223,8 @@ class TestModel(unittest.TestCase):
     def fit(self, dynamic, num_replicas=None, rank=None):
         fluid.enable_dygraph(self.device) if dynamic else None
         seed = 333
-        fluid.default_startup_program().random_seed = seed
-        fluid.default_main_program().random_seed = seed
+        paddle.manual_seed(seed)
+        paddle.framework.random._manual_program_seed(seed)
 
         net = LeNet(classifier_activation=None)
         optim_new = fluid.optimizer.Adam(
@@ -320,17 +317,19 @@ class TestModel(unittest.TestCase):
 class MyModel(fluid.dygraph.Layer):
     def __init__(self, classifier_activation='softmax'):
         super(MyModel, self).__init__()
-        self._fc = Linear(20, 10, act=classifier_activation)
+        self._fc = Linear(20, 10)
+        self._act = Softmax()  #Todo: accept any activation
 
     def forward(self, x):
         y = self._fc(x)
+        y = self._act(y)
         return y
 
 
 class TestModelFunction(unittest.TestCase):
     def set_seed(self, seed=1024):
-        fluid.default_startup_program().random_seed = seed
-        fluid.default_main_program().random_seed = seed
+        paddle.manual_seed(seed)
+        paddle.framework.random._manual_program_seed(seed)
 
     def test_train_batch(self, dynamic=True):
         dim = 20
diff --git a/python/paddle/incubate/hapi/tests/test_text.py b/python/paddle/incubate/hapi/tests/test_text.py
index bdc637997b0cbd8389fdfab9f71597c62b0e21a3..c4fef0d749ce788e50d8cffdf9b7041e33d078af 100644
--- a/python/paddle/incubate/hapi/tests/test_text.py
+++ b/python/paddle/incubate/hapi/tests/test_text.py
@@ -20,6 +20,7 @@ import random
 
 import numpy as np
 
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import Embedding, Linear, Layer
 from paddle.fluid.layers import BeamSearchDecoder
@@ -87,15 +88,18 @@ class ModuleApiTest(unittest.TestCase):
             fluid.enable_dygraph(place)
         else:
             fluid.disable_dygraph()
-        fluid.default_main_program().random_seed = self._random_seed
-        fluid.default_startup_program().random_seed = self._random_seed
-        layer = self.model_cls(**self.attrs) if isinstance(
-            self.attrs, dict) else self.model_cls(*self.attrs)
-        model = Model(layer, inputs=self.make_inputs())
-        model.prepare()
-        if self.param_states:
-            model.load(self.param_states, optim_state=None)
-        return model.test_batch(self.inputs)
+        gen = paddle.manual_seed(self._random_seed)
+        gen._is_init_py = False
+        paddle.framework.random._manual_program_seed(self._random_seed)
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            layer = self.model_cls(**self.attrs) if isinstance(
+                self.attrs, dict) else self.model_cls(*self.attrs)
+            model = Model(layer, inputs=self.make_inputs())
+            model.prepare()
+            if self.param_states:
+                model.load(self.param_states, optim_state=None)
+            return model.test_batch(self.inputs)
 
     def check_output_with_place(self, place, mode="test"):
         dygraph_output = self._calc_output(place, mode, dygraph=True)
@@ -129,12 +133,9 @@ class TestBasicLSTM(ModuleApiTest):
 
     @staticmethod
     def model_init(model, input_size, hidden_size):
-        model.lstm = RNN(
-            BasicLSTMCell(
-                input_size,
-                hidden_size,
-                param_attr=fluid.ParamAttr(name="lstm_weight"),
-                bias_attr=fluid.ParamAttr(name="lstm_bias")))
+        model.lstm = RNN(BasicLSTMCell(
+            input_size,
+            hidden_size, ))
 
     @staticmethod
     def model_forward(model, inputs):
diff --git a/python/paddle/incubate/hapi/tests/test_transforms.py b/python/paddle/incubate/hapi/tests/test_transforms.py
index 087f2d1615fc916d23464c1c4387b8f6befe6ac8..84208fda1e947f343de52a0a3c8de68322672013 100644
--- a/python/paddle/incubate/hapi/tests/test_transforms.py
+++ b/python/paddle/incubate/hapi/tests/test_transforms.py
@@ -64,6 +64,11 @@ class TestTransforms(unittest.TestCase):
 
         self.do_transform(trans)
 
+    def test_normalize(self):
+        normalize = transforms.Normalize(mean=0.5, std=0.5)
+        trans = transforms.Compose([transforms.Permute(mode='CHW'), normalize])
+        self.do_transform(trans)
+
     def test_trans_resize(self):
         trans = transforms.Compose([
             transforms.Resize(300, [0, 1]),
@@ -165,7 +170,7 @@ class TestTransforms(unittest.TestCase):
         fake_img = np.random.rand(500, 400, 3).astype('float32')
         fake_img_gray = trans_gray(fake_img)
 
-        np.testing.assert_equal(len(fake_img_gray.shape), 2)
+        np.testing.assert_equal(len(fake_img_gray.shape), 3)
         np.testing.assert_equal(fake_img_gray.shape[0], 500)
         np.testing.assert_equal(fake_img_gray.shape[1], 400)
 
diff --git a/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py b/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py
index 26ec53014b1c3b113a0e1ee82f3b9edfe9f48a3f..6df9b31217aae78c43de8d29956a8b2def99055b 100644
--- a/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py
+++ b/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py
@@ -22,7 +22,7 @@ import shutil
 import tempfile
 
 from paddle import fluid
-from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential
+from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax
 
 from paddle.incubate.hapi.utils import uncombined_weight_to_state_dict
 
@@ -43,10 +43,8 @@ class LeNetDygraph(fluid.dygraph.Layer):
 
         if num_classes > 0:
             self.fc = Sequential(
-                Linear(400, 120),
-                Linear(120, 84),
-                Linear(
-                    84, 10, act=classifier_activation))
+                Linear(400, 120), Linear(120, 84), Linear(84, 10),
+                Softmax())  #Todo: accept any activation
 
     def forward(self, inputs):
         x = self.features(inputs)
diff --git a/python/paddle/incubate/hapi/vision/models/lenet.py b/python/paddle/incubate/hapi/vision/models/lenet.py
index dc7b094de0f26e04b9f07d011d3ce492950df269..169f70562f6edfe1773a1c8d75004c25831cedcb 100644
--- a/python/paddle/incubate/hapi/vision/models/lenet.py
+++ b/python/paddle/incubate/hapi/vision/models/lenet.py
@@ -13,7 +13,7 @@
 #limitations under the License.
 
 import paddle.fluid as fluid
-from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential
+from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax
 
 __all__ = ['LeNet']
 
@@ -50,10 +50,8 @@ class LeNet(fluid.dygraph.Layer):
 
         if num_classes > 0:
             self.fc = Sequential(
-                Linear(400, 120),
-                Linear(120, 84),
-                Linear(
-                    84, 10, act=classifier_activation))
+                Linear(400, 120), Linear(120, 84), Linear(84, 10),
+                Softmax())  #Todo: accept any activation
 
     def forward(self, inputs):
         x = self.features(inputs)
diff --git a/python/paddle/incubate/hapi/vision/models/vgg.py b/python/paddle/incubate/hapi/vision/models/vgg.py
index 30f6e120b2502113045b3583686360f4ed2c32ac..4352a768eb7206ca30acead580a64a7d04b7701b 100644
--- a/python/paddle/incubate/hapi/vision/models/vgg.py
+++ b/python/paddle/incubate/hapi/vision/models/vgg.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
-from paddle.nn import Conv2d, Pool2D, BatchNorm, Linear, ReLU
+from paddle.nn import Conv2d, Pool2D, BatchNorm, Linear, ReLU, Softmax
 from paddle.fluid.dygraph.container import Sequential
 
 from ...download import get_weights_path_from_url
@@ -37,7 +37,8 @@ class Classifier(fluid.dygraph.Layer):
         super(Classifier, self).__init__()
         self.linear1 = Linear(512 * 7 * 7, 4096)
         self.linear2 = Linear(4096, 4096)
-        self.linear3 = Linear(4096, num_classes, act=classifier_activation)
+        self.linear3 = Linear(4096, num_classes)
+        self.act = Softmax()  #Todo: accept any activation
 
     def forward(self, x):
         x = self.linear1(x)
@@ -46,7 +47,8 @@ class Classifier(fluid.dygraph.Layer):
         x = self.linear2(x)
         x = fluid.layers.relu(x)
         x = fluid.layers.dropout(x, 0.5)
-        out = self.linear3(x)
+        x = self.linear3(x)
+        out = self.act(x)
         return out
 
 
diff --git a/python/paddle/incubate/hapi/vision/transforms/functional.py b/python/paddle/incubate/hapi/vision/transforms/functional.py
index f76aa6be8b4ddaf8b57278b32cf11d145350d772..b118ee3fc7553dc7d02028ae273be33166829635 100644
--- a/python/paddle/incubate/hapi/vision/transforms/functional.py
+++ b/python/paddle/incubate/hapi/vision/transforms/functional.py
@@ -16,6 +16,7 @@ import sys
 import collections
 import random
 import math
+import functools
 
 import cv2
 import numbers
@@ -31,6 +32,23 @@ else:
 __all__ = ['flip', 'resize', 'pad', 'rotate', 'to_grayscale']
 
 
+def keepdims(func):
+    """Keep the dimension of input images unchanged"""
+
+    @functools.wraps(func)
+    def wrapper(image, *args, **kwargs):
+        if len(image.shape) != 3:
+            raise ValueError("Expect image have 3 dims, but got {} dims".format(
+                len(image.shape)))
+        ret = func(image, *args, **kwargs)
+        if len(ret.shape) == 2:
+            ret = ret[:, :, np.newaxis]
+        return ret
+
+    return wrapper
+
+
+@keepdims
 def flip(image, code):
     """
     Accordding to the code (the type of flip), flip the input image
@@ -62,6 +80,7 @@ def flip(image, code):
     return cv2.flip(image, flipCode=code)
 
 
+@keepdims
 def resize(img, size, interpolation=cv2.INTER_LINEAR):
     """
     resize the input data to given size
@@ -103,6 +122,7 @@ def resize(img, size, interpolation=cv2.INTER_LINEAR):
         return cv2.resize(img, size[::-1], interpolation=interpolation)
 
 
+@keepdims
 def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'):
     """Pads the given CV Image on all sides with speficified padding mode and fill value.
 
@@ -193,6 +213,7 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'):
     return img
 
 
+@keepdims
 def rotate(img,
            angle,
            interpolation=cv2.INTER_LINEAR,
@@ -266,6 +287,7 @@ def rotate(img,
     return dst.astype(dtype)
 
 
+@keepdims
 def to_grayscale(img, num_output_channels=1):
     """Converts image to grayscale version of image.
 
diff --git a/python/paddle/incubate/hapi/vision/transforms/transforms.py b/python/paddle/incubate/hapi/vision/transforms/transforms.py
index 90c6e279959b2133e5cc1184b981723b34c0b750..d46faa0685aa13790be217e0c99ab407790dd2ca 100644
--- a/python/paddle/incubate/hapi/vision/transforms/transforms.py
+++ b/python/paddle/incubate/hapi/vision/transforms/transforms.py
@@ -505,7 +505,7 @@ class Normalize(object):
             mean = [mean, mean, mean]
 
         if isinstance(std, numbers.Number):
-            mean = [std, std, std]
+            std = [std, std, std]
 
         self.mean = np.array(mean, dtype=np.float32).reshape(len(mean), 1, 1)
         self.std = np.array(std, dtype=np.float32).reshape(len(std), 1, 1)
diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py
index 89bbd5916578b6e3169452d85e581c438f2bbb47..78f792d6a5a6698034912297f5d5a23db0b35201 100644
--- a/python/paddle/io/__init__.py
+++ b/python/paddle/io/__init__.py
@@ -16,6 +16,7 @@
 __all__ = [
     'Dataset',
     'IterableDataset',
+    'TensorDataset',
     'BatchSampler',
     #            'Transform',
     'DataLoader',
@@ -42,7 +43,7 @@ __all__ = [
 
 from ..fluid.io import DataLoader
 from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worker_info, \
-        Sampler, SequenceSampler, RandomSampler
+        TensorDataset, Sampler, SequenceSampler, RandomSampler
 from ..fluid.io import load, save, load_program_state, set_program_state, \
         load_inference_model, save_inference_model, batch
 from ..reader import shuffle, buffered, cache, chain, firstn, compose, map_readers, xmap_readers
diff --git a/python/paddle/jit/__init__.py b/python/paddle/jit/__init__.py
index 47369e3ff9cd87539f9e96708ff981dc67d06420..03299a3bb9823d31c40ae4faab601ed89570c71e 100644
--- a/python/paddle/jit/__init__.py
+++ b/python/paddle/jit/__init__.py
@@ -16,11 +16,13 @@ from ..fluid.dygraph.jit import save  #DEFINE_ALIAS
 from ..fluid.dygraph.jit import load  #DEFINE_ALIAS
 from ..fluid.dygraph.jit import SaveLoadConfig  #DEFINE_ALIAS
 from ..fluid.dygraph.jit import TracedLayer  #DEFINE_ALIAS
+from ..fluid.dygraph.jit import set_code_level  #DEFINE_ALIAS
+from ..fluid.dygraph.jit import set_verbosity  #DEFINE_ALIAS
 from ..fluid.dygraph.jit import declarative as to_static  #DEFINE_ALIAS
 from ..fluid.dygraph import ProgramTranslator  #DEFINE_ALIAS
 from ..fluid.dygraph.io import TranslatedLayer  #DEFINE_ALIAS
 
 __all__ = [
     'save', 'load', 'SaveLoadConfig', 'TracedLayer', 'to_static',
-    'ProgramTranslator', 'TranslatedLayer'
+    'ProgramTranslator', 'TranslatedLayer', 'set_code_level', 'set_verbosity'
 ]
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index 07b3f0d284dcd28d4967131ab85bb2ca3cd1d6da..5cc9f6d32f9d7ef3dafd73badd0ea88bed372968 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -18,6 +18,7 @@
 from .layer import norm
 from .functional import extension
 from .layer import common
+from .layer import rnn
 from .utils import weight_norm_hook
 
 from . import initializer
@@ -26,6 +27,7 @@ __all__ = []
 __all__ += norm.__all__
 __all__ += extension.__all__
 __all__ += common.__all__
+__all__ += rnn.__all__
 __all__ += weight_norm_hook.__all__
 
 # TODO: define alias in nn directory
@@ -88,13 +90,27 @@ from .layer.common import Embedding  #DEFINE_ALIAS
 from .layer.common import Linear  #DEFINE_ALIAS
 from .layer.common import Flatten  #DEFINE_ALIAS
 from .layer.common import UpSample  #DEFINE_ALIAS
+from .layer.common import UpsamplingNearest2d  #DEFINE_ALIAS
+from .layer.common import UpsamplingBilinear2d  #DEFINE_ALIAS
 from .layer.common import Bilinear  #DEFINE_ALIAS
 from .layer.common import Dropout  #DEFINE_ALIAS
 from .layer.common import Dropout2D  #DEFINE_ALIAS
 from .layer.common import Dropout3D  #DEFINE_ALIAS
 from .layer.common import AlphaDropout  #DEFINE_ALIAS
+
+from .layer.pooling import AvgPool1d  #DEFINE_ALIAS
+from .layer.pooling import AvgPool2d  #DEFINE_ALIAS
+from .layer.pooling import AvgPool3d  #DEFINE_ALIAS
+from .layer.pooling import MaxPool1d  #DEFINE_ALIAS
+from .layer.pooling import MaxPool2d  #DEFINE_ALIAS
+from .layer.pooling import MaxPool3d  #DEFINE_ALIAS
+from .layer.pooling import AdaptiveAvgPool1d  #DEFINE_ALIAS
 from .layer.pooling import AdaptiveAvgPool2d  #DEFINE_ALIAS
 from .layer.pooling import AdaptiveAvgPool3d  #DEFINE_ALIAS
+
+from .layer.pooling import AdaptiveMaxPool1d  #DEFINE_ALIAS
+from .layer.pooling import AdaptiveMaxPool2d  #DEFINE_ALIAS
+from .layer.pooling import AdaptiveMaxPool3d  #DEFINE_ALIAS
 from .layer.conv import Conv1d  #DEFINE_ALIAS
 from .layer.conv import Conv2d  #DEFINE_ALIAS
 from .layer.conv import Conv3d  #DEFINE_ALIAS
@@ -111,6 +127,7 @@ from .layer.extension import RowConv  #DEFINE_ALIAS
 # from .layer.learning_rate import NoamDecay        #DEFINE_ALIAS
 # from .layer.learning_rate import PiecewiseDecay        #DEFINE_ALIAS
 # from .layer.learning_rate import PolynomialDecay        #DEFINE_ALIAS
+from .layer.common import Linear
 # from .layer.loss import NCELoss        #DEFINE_ALIAS
 from .layer.loss import BCEWithLogitsLoss  #DEFINE_ALIAS
 from .layer.loss import CrossEntropyLoss  #DEFINE_ALIAS
@@ -134,6 +151,7 @@ from .layer.norm import InstanceNorm3d  #DEFINE_ALIAS
 from .layer.norm import BatchNorm1d  #DEFINE_ALIAS
 from .layer.norm import BatchNorm2d  #DEFINE_ALIAS
 from .layer.norm import BatchNorm3d  #DEFINE_ALIAS
+from .layer.rnn import *
 # from .layer.rnn import RNNCell        #DEFINE_ALIAS
 # from .layer.rnn import GRUCell        #DEFINE_ALIAS
 # from .layer.rnn import LSTMCell        #DEFINE_ALIAS
diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py
index 97a4d5432bdc24912a851741328516e9269a64c2..3c0aa9c5c99e545b657559c30fcde46a69781231 100644
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -75,6 +75,7 @@ from .common import interpolate  #DEFINE_ALIAS
 from .common import bilinear  #DEFINE_ALIAS
 from .conv import conv1d  #DEFINE_ALIAS
 from .conv import conv_transpose1d  #DEFINE_ALIAS
+from .common import linear  #DEFINE_ALIAS
 from .conv import conv2d  #DEFINE_ALIAS
 from .conv import conv_transpose2d  #DEFINE_ALIAS
 from .conv import conv3d  #DEFINE_ALIAS
@@ -169,20 +170,28 @@ from .norm import layer_norm  #DEFINE_ALIAS
 from .norm import lrn  #DEFINE_ALIAS
 from .norm import normalize  #DEFINE_ALIAS
 # from .norm import spectral_norm        #DEFINE_ALIAS
-from .pooling import max_pool1d  #DEFINE_ALIAS
-from .pooling import avg_pool1d  #DEFINE_ALIAS
-from .pooling import adaptive_max_pool1d  #DEFINE_ALIAS
-from .pooling import adaptive_avg_pool1d  #DEFINE_ALIAS
 from .pooling import pool2d  #DEFINE_ALIAS
 from .pooling import pool3d  #DEFINE_ALIAS
+from .pooling import avg_pool1d  #DEFINE_ALIAS
 from .pooling import adaptive_pool2d  #DEFINE_ALIAS
 from .pooling import adaptive_pool3d  #DEFINE_ALIAS
 from .pooling import avg_pool2d  #DEFINE_ALIAS
-from .pooling import max_pool2d  #DEFINE_ALIAS
 from .pooling import avg_pool3d  #DEFINE_ALIAS
+from .pooling import max_pool1d  #DEFINE_ALIAS
+from .pooling import max_pool2d  #DEFINE_ALIAS
 from .pooling import max_pool3d  #DEFINE_ALIAS
+
+from .pooling import adaptive_pool2d  #DEFINE_ALIAS
+from .pooling import adaptive_pool3d  #DEFINE_ALIAS
+from .pooling import adaptive_max_pool1d  #DEFINE_ALIAS
+from .pooling import adaptive_max_pool2d  #DEFINE_ALIAS
+from .pooling import adaptive_max_pool3d  #DEFINE_ALIAS
+from .pooling import adaptive_avg_pool1d  #DEFINE_ALIAS
 from .pooling import adaptive_avg_pool2d  #DEFINE_ALIAS
 from .pooling import adaptive_avg_pool3d  #DEFINE_ALIAS
+
+from .rnn import rnn  #DEFINE_ALIAS
+from .rnn import birnn  #DEFINE_ALIAS
 # from .rnn import gru_unit        #DEFINE_ALIAS
 # from .rnn import lstm        #DEFINE_ALIAS
 # from .rnn import lstm_unit        #DEFINE_ALIAS
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index 2e399db2a9aba4edce5ebd42df83df16937a80d9..ffedb027330bda94db86dc0943a5c4a7281f254f 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -168,13 +168,13 @@ def hardshrink(x, threshold=0.5, name=None):
     .. math::
 
         hardshrink(x)=
-            \left\{
-            \begin{aligned}
-            &x, & & if \ x > threshold \\
-            &x, & & if \ x < -threshold \\
-            &0, & & if \ others
-            \end{aligned}
-            \right.
+            \\left\\{
+            \\begin{aligned}
+            &x, & & if \\ x > threshold \\\\
+            &x, & & if \\ x < -threshold \\\\
+            &0, & & if \\ others
+            \\end{aligned}
+            \\right.
 
     Args:
         x (Tensor): The input Tensor with data type float32, float64.
@@ -391,14 +391,14 @@ def leaky_relu(x, negative_slope=0.01, name=None):
     """
     leaky_relu activation
 
-    .. math:
-        leaky_relu(x)=
-            \left\{
-            \begin{aligned}
-            &x, & & if \ x >= 0 \\
-            &negative\_slope * x, & & otherwise \\
-            \end{aligned}
-            \right. \\
+    .. math::
+        leaky\\_relu(x)=
+            \\left\\{
+            \\begin{aligned}
+            &x, & & if \\ x >= 0 \\\\
+            &negative\\_slope * x, & & otherwise \\\\
+            \\end{aligned}
+            \\right. \\\\
 
     Args:
         x (Tensor): The input Tensor with data type float32, float64.
@@ -652,8 +652,8 @@ def selu(x,
 
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
-        scale (float, optional): The value of scale for selu. Default is 1.0507009873554804934193349852946
-        alpha (float, optional): The value of alpha for selu. Default is 1.6732632423543772848170429916717
+        scale (float, optional): The value of scale (must be greater than 1.0) for selu. Default is 1.0507009873554804934193349852946
+        alpha (float, optional): The value of alpha (must be no less than zero) for selu. Default is 1.6732632423543772848170429916717
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
 
@@ -672,6 +672,14 @@ def selu(x,
             x = paddle.to_tensor(np.array([[0.0, 1.0],[2.0, 3.0]]))
             out = F.selu(x) # [[0, 1.050701],[2.101402, 3.152103]]
     """
+    if scale <= 1.0:
+        raise ValueError(
+            "The scale must be greater than 1.0. Received: {}.".format(scale))
+
+    if alpha < 0:
+        raise ValueError(
+            "The alpha must be no less than zero. Received: {}.".format(alpha))
+
     if in_dygraph_mode():
         return core.ops.selu(x, 'scale', scale, 'alpha', alpha)
 
@@ -1033,8 +1041,8 @@ def log_softmax(x, axis=-1, dtype=None, name=None):
 
     .. math::
 
-        Out[i, j] = log(softmax(x)) 
-                  = log(\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])})
+        log\\_softmax[i, j] = log(softmax(x))
+                            = log(\\frac{\\exp(X[i, j])}{\\sum_j(\\exp(X[i, j]))})
 
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index cff108ec6a9a8666e0aa51ba0414fd885777f1a7..623af3277fba0e29fb77b02c711e258602f1f75a 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -17,7 +17,8 @@ import paddle
 from ...fluid.framework import in_dygraph_mode, default_main_program
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.layers.tensor import Variable, fill_constant, zeros, concat
-
+from ...fluid.layers import core
+from ...fluid import dygraph_utils
 # TODO: define the common functions to build a neural network  
 from ...fluid.layers import label_smooth  #DEFINE_ALIAS
 from ...fluid import one_hot  #DEFINE_ALIAS
@@ -30,6 +31,10 @@ from ...fluid.layers import elementwise_mul  #DEFINE_ALIAS
 from ...tensor import clip
 from ...tensor import sum
 from ...tensor import sqrt
+from ...tensor import sum  #DEFINE_ALIAS
+from ...tensor import sqrt  #DEFINE_ALIAS
+from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
+from ...fluid.framework import Variable, in_dygraph_mode, _varbase_creator
 
 #from ...fluid.layers import fc  #DEFINE_ALIAS
 from ...fluid.layers import pad_constant_like  #DEFINE_ALIAS
@@ -46,6 +51,7 @@ __all__ = [
     #       'embedding',
     #       'fc',
     'label_smooth',
+    'linear',
     'one_hot',
     'pad',
     'pad_constant_like',
@@ -54,30 +60,28 @@ __all__ = [
     #       'bilinear_tensor_product',
     'assign',
     'interpolate',
+    'upsample',
     'bilinear',
     'cosine_similarity',
 ]
 
 
-def interpolate(input,
+def interpolate(x,
                 size=None,
                 scale_factor=None,
                 mode='nearest',
                 align_corners=False,
-                align_mode=1,
+                align_mode=0,
                 data_format='NCHW',
                 name=None):
     """
-	:alias_main: paddle.nn.functional.interpolate
-	:alias: paddle.nn.functional.interpolate,paddle.nn.functional.common.interpolate
 
     This op resizes a batch of images.
     The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
     or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape
     (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels),
     and the resizing only applies on the three dimensions(depth, height and width).
-    **Warning:** the parameter :attr:`actual_shape` will be deprecated in the
-    future and only use :attr:`out_shape` instead.
+
     Supporting resample methods:
         'linear' : Linear interpolation
         'bilinear' : Bilinear interpolation
@@ -102,7 +106,7 @@ def interpolate(input,
     interpolating functions of three variables (e.g. D-direction,
     H-direction and W-direction in this op) on a rectilinear 3D grid.
     The linear interpolation is performed on three directions.
-    Align_corners and align_mode are optional parameters,the calculation method
+    align_corners and align_mode are optional parameters,the calculation method
     of interpolation can be selected by them.
 
     Bicubic interpolation is an extension of cubic interpolation for interpolating
@@ -132,18 +136,12 @@ def interpolate(input,
                 W_out = W_{in} * scale_{factor}
         
         Nearest neighbor interpolation:
-          if:
+
               align_corners = False
               input : (N,C,H_in,W_in)
               output: (N,C,H_out,W_out) where:
               H_out = floor (H_{in} * scale_{factor})
               W_out = floor (W_{in} * scale_{factor})
-          else:
-              align_corners = True
-              input : (N,C,H_in,W_in)
-              output: (N,C,H_out,W_out) where:
-              H_out = round(H_{in} * scale_{factor})
-              W_out = round(W_{in} * scale_{factor})
 
         Bilinear interpolation:
           if:
@@ -202,22 +200,22 @@ def interpolate(input,
     https://en.wikipedia.org/wiki/Bicubic_interpolation
     
     Parameters:
-        input (Variable): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
+        x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
                           its data format is specified by :attr:`data_format`.
-        size (list|tuple|Variable|None): Output shape of image resize
+        size (list|tuple|Tensor|None): Output shape of image resize
              layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) 
              when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
              Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1].
              If a Tensor Variable, its dimensions size should be a 1.
-        scale_factor (float|Variable|None): The multiplier for the input height or width. At
+        scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At
              least one of :attr:`out_shape` or :attr:`scale_factor` must be set.
-             And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.
+             And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. Has to match input size if it is a list.
              Default: None.
         mode (str): The resample method. It supports 'linear', 'nearest', 'bilinear',
                        'bicubic' and 'trilinear' currently. Default: 'nearest'
         align_corners(bool) :  An optional bool, If True, the centers of the 4 corner pixels of the
                                input and output tensors are aligned, preserving the values at the
-                               corner pixels.
+                               corner pixels. This only has an effect when mode is 'linear', 'bilinear', 'bicubic' or 'trilinear'.
                                Default: False
         align_mode(int)  :  An optional for linear/bilinear/trilinear interpolation. Refer to the formula in the example above,
                             it can be \'0\' for src_idx = scale_factor*(dst_indx+0.5)-0.5 , can be \'1\' for
@@ -235,7 +233,7 @@ def interpolate(input,
         A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
         or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels).
     Raises:
-        TypeError: size should be a list or tuple or Variable.
+        TypeError: size should be a list or tuple or Tensor.
         ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear',
                     'trilinear', 'bicubic', or 'nearest' currently.
         ValueError: 'linear' only support 3-D tensor.
@@ -253,53 +251,27 @@ def interpolate(input,
     Examples:
         .. code-block:: python
 
-	    #declarative mode
 	    import paddle
 	    import numpy as np
-	    input = fluid.data(name="input", shape=[None,3,6,10])
-	    #1
-	    output = paddle.nn.functional.interpolate(input=input, size=[12,12])
-	    #2
-	    #x = np.array([2]).astype("int32")
-	    #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32")
-	    #fluid.layers.assign(input=x, output=dim1)
-	    #output = paddle.nn.functional.interpolate(input=input, size=[12,dim1])
-	    #3
-	    #x = np.array([3,12]).astype("int32")
-	    #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32")
-	    #fluid.layers.assign(input=x, output=shape_tensor)
-	    #output = paddle.nn.functional.interpolate(input=input, size=shape_tensor)
-	    #4
-	    #x = np.array([0.5]).astype("float32")
-	    #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32")
-	    #fluid.layers.assign(x,scale_tensor)
-	    #output = paddle.nn.functional.interpolate(input=input, scale_factor=scale_tensor)
-	    place = fluid.CPUPlace()
-	    exe = fluid.Executor(place)
-	    exe.run(fluid.default_startup_program())
-
-	    input_data = np.random.rand(2,3,6,10).astype("float32")
-	    output_data = exe.run(fluid.default_main_program(),
-                feed={"input":input_data},
-                fetch_list=[output],
-                return_numpy=True)
-
-	    print(output_data[0].shape)
-	    #1
-	    # (2, 3, 12, 12)
-	    #2
-	    # (2, 3, 12, 2)
-	    #3
-	    # (2, 3, 3, 12)
-	    #4
-	    # (2, 3, 3, 5)
-	    #imperative mode
-	    import paddle.fluid.dygraph as dg
-	    with dg.guard(place) as g:
-    		input = dg.to_variable(input_data)
-    		output = paddle.nn.functional.interpolate(input=input, size=[12,12])
-    		print(output.shape)
-		# [2L, 3L, 12L, 12L]
+            import paddle.nn.functional as F
+            paddle.disable_static()
+            
+            # given out size
+            input_data = np.random.rand(2,3,6,10).astype("float32")
+            x = paddle.to_tensor(input_data)
+            output_1 = F.interpolate(x=x, size=[12,12])
+            print(output_1.shape)
+            # [2L, 3L, 12L, 12L]
+            
+            # given scale
+            output_2 = F.interpolate(x=x, scale_factor=[2,1])
+            print(output_2.shape)
+            # [2L, 3L, 12L, 10L]
+            
+            # bilinear interp
+            output_3 = F.interpolate(x=x, scale_factor=[2,1], mode="bilinear")
+            print(output_3.shape)
+            # [2L, 3L, 12L, 10L]
     """
     data_format = data_format.upper()
     resample = mode.upper()
@@ -317,13 +289,13 @@ def interpolate(input,
             "The 'resample' of image_resize can only be 'linaer', 'bilinear', 'trilinear', "
             " 'bicubic' or 'nearest' currently.")
 
-    if resample in ['LINEAR'] and len(input.shape) != 3:
+    if resample in ['LINEAR'] and len(x.shape) != 3:
         raise ValueError("'linear' only support 3-D tensor.")
 
-    if resample in ['BILINEAR', 'NEAREST', 'BICUBIC'] and len(input.shape) != 4:
+    if resample in ['BILINEAR', 'NEAREST', 'BICUBIC'] and len(x.shape) != 4:
         raise ValueError(
             "'bilinear', 'bicubic' and 'nearest' only support 4-D tensor.")
-    if resample == 'TRILINEAR' and len(input.shape) != 5:
+    if resample == 'TRILINEAR' and len(x.shape) != 5:
         raise ValueError("'trilinear'only support 5-D tensor.")
 
     if size is None and scale_factor is None:
@@ -334,19 +306,21 @@ def interpolate(input,
 
     if align_mode != 0 and align_mode != 1:
         raise ValueError("align_mode can only be 0 or 1")
-
-    helper = LayerHelper('{}_interp'.format(resample_type), **locals())
+    if align_corners != 0 and resample == 'NEAREST':
+        raise ValueError(
+            "align_corners option can only be set with the interpolating modes: linear | bilinear | bicubic | trilinear"
+        )
+    helper = LayerHelper('{}_interp_v2'.format(resample_type), **locals())
     dtype = helper.input_dtype()
-
-    if len(input.shape) == 3 and data_format not in ['NCW', 'NWC']:
+    if len(x.shape) == 3 and data_format not in ['NCW', 'NWC']:
         raise ValueError(
             "Got wrong value for param `data_format`: " + data_format +
             " received but only `NCW` or `NWC` supported for 3-D input.")
-    elif len(input.shape) == 4 and data_format not in ['NCHW', 'NHWC']:
+    elif len(x.shape) == 4 and data_format not in ['NCHW', 'NHWC']:
         raise ValueError(
             "Got wrong value for param `data_format`: " + data_format +
             " received but only `NCHW` or `NHWC` supported for 4-D input.")
-    elif len(input.shape) == 5 and data_format not in ['NCDHW', 'NDHWC']:
+    elif len(x.shape) == 5 and data_format not in ['NCDHW', 'NDHWC']:
         raise ValueError(
             "Got wrong value for param `data_format`: " + data_format +
             " received but only `NCDHW` or `NDHWC` supported for 5-D input.")
@@ -359,7 +333,10 @@ def interpolate(input,
     if data_format == 'NHWC' or data_format == 'NDHWC' or data_format == 'NWC':
         data_layout = 'NHWC'
 
-    inputs = {"X": input}
+    if resample == 'NEAREST':
+        align_corners = False
+
+    inputs = {"X": x}
     attrs = {
         "out_d": -1,
         "out_h": -1,
@@ -408,7 +385,7 @@ def interpolate(input,
                         size_list.append(dim)
                 inputs['SizeTensor'] = new_size_tensor
 
-            if len(input.shape) == 3:
+            if len(x.shape) == 3:
                 if len(out_shape) != 1:
                     raise ValueError(
                         "out_shape length should be 2 for input 3-D tensor")
@@ -417,7 +394,7 @@ def interpolate(input,
                 else:
                     out_shape = list(map(int, out_shape))
                     attrs['out_w'] = out_shape[0]
-            if len(input.shape) == 4:
+            if len(x.shape) == 4:
                 if len(out_shape) != 2:
                     raise ValueError("out_shape length should be 2 for "
                                      "input 4-D tensor.")
@@ -428,7 +405,7 @@ def interpolate(input,
                     out_shape = list(map(int, out_shape))
                     attrs['out_h'] = out_shape[0]
                     attrs['out_w'] = out_shape[1]
-            if len(input.shape) == 5:
+            if len(x.shape) == 5:
                 if len(out_shape) != 3:
                     raise ValueError("out_shape length should be 3 for "
                                      "input 5-D tensor.")
@@ -449,36 +426,247 @@ def interpolate(input,
         elif isinstance(scale, float) or isinstance(scale, int):
             if scale <= 0:
                 raise ValueError("Attr(scale) should be greater than zero.")
-            attrs['scale'] = float(scale)
+            scale_list = []
+            for i in range(len(x.shape) - 2):
+                scale_list.append(scale)
+            attrs['scale'] = list(map(float, scale_list))
+        elif isinstance(scale, list):
+            if len(scale) != len(x.shape) - 2:
+                raise ValueError("scale_shape length should be {} for "
+                                 "input {}-D tensor.".format(
+                                     len(x.shape) - 2, len(x.shape)))
+            for value in scale:
+                if value <= 0:
+                    raise ValueError("Attr(scale) should be greater than zero.")
+            attrs['scale'] = list(map(float, scale))
         else:
             raise TypeError(
-                "Attr(scale)'s type should be float, int or Variable.")
+                "Attr(scale)'s type should be float, int, list or Tensor.")
 
+    if in_dygraph_mode():
+        attr_list = []
+        for k, v in attrs.items():
+            attr_list.append(k)
+            attr_list.append(v)
+        dy_attr = tuple(attr_list)
+
+        if resample_type == "linear":
+            out = core.ops.linear_interp_v2(x, *dy_attr)
+        if resample_type == "bilinear":
+            out = core.ops.bilinear_interp_v2(x, *dy_attr)
+        if resample_type == "trilinear":
+            out = core.ops.trilinear_interp_v2(x, *dy_attr)
+        if resample_type == "nearest":
+            out = core.ops.nearest_interp_v2(x, *dy_attr)
+        if resample_type == "bicubic":
+            out = core.ops.bicubic_interp_v2(x, *dy_attr)
+        return out
     out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
-        type='{}_interp'.format(resample_type),
+        type='{}_interp_v2'.format(resample_type),
         inputs=inputs,
         outputs={"Out": out},
         attrs=attrs)
     return out
 
 
-def bilinear(x1, x2, weight, bias=None, name=None):
+def upsample(x,
+             size=None,
+             scale_factor=None,
+             mode='nearest',
+             align_corners=False,
+             align_mode=0,
+             data_format='NCHW',
+             name=None):
     """
+    This op resizes a batch of images.
+    The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
+    or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape
+    (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels),
+    and the resizing only applies on the three dimensions(depth, height and width).
 
-    This layer performs bilinear on two inputs.
+    Supporting resample methods:
+        'linear' : Linear interpolation
+        'bilinear' : Bilinear interpolation
+        'trilinear' : Trilinear interpolation
+        'nearest' : Nearest neighbor interpolation
+        'bicubic' : Bicubic interpolation
+    Linear interpolation is the method of using a line connecting two known quantities 
+    to determine the value of an unknown quantity between the two known quantities. 
+    
+    Nearest neighbor interpolation is to perform nearest neighbor interpolation
+    in both the 3rd dimension(in height direction) and the 4th dimension(in width
+    direction) on input tensor.
+    Bilinear interpolation is an extension of linear interpolation for
+    interpolating functions of two variables (e.g. H-direction and
+    W-direction in this op) on a rectilinear 2D grid. The key idea is
+    to perform linear interpolation first in one direction, and then
+    again in the other direction.
+    
+    Bicubic interpolation is an extension of cubic interpolation for interpolating
+    data points on a two-dimensional regular grid. The interpolated surface is
+    smoother than corresponding surfaces obtained by bilinear interpolation or
+    nearest-neighbor interpolation.
+    Trilinear interpolation is an extension of linear interpolation for
+    interpolating functions of three variables (e.g. D-direction,
+    H-direction and W-direction in this op) on a rectilinear 3D grid.
+    The linear interpolation is performed on three directions.
+    align_corners and align_mode are optional parameters,the calculation method
+    of interpolation can be selected by them.
+    Example:
+    .. code-block:: text
+        For scale_factor:
+            if align_corners = True && out_size > 1 :
+              scale_factor = (in_size-1.0)/(out_size-1.0)
+            else:
+              scale_factor = float(in_size/out_size)
+        Linear interpolation:
+            if:
+                align_corners = False , align_mode = 0
+                input : (N,C,W_in)
+                output: (N,C,W_out) where:
+                W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+            else:
+                input : (N,C,W_in)
+                output: (N,C,W_out) where:
+                W_out = W_{in} * scale_{factor}
+        Nearest neighbor interpolation:
+          if:
+              align_corners = False
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = floor (H_{in} * scale_{factor})
+              W_out = floor (W_{in} * scale_{factor})
+          else:
+              align_corners = True
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = round(H_{in} * scale_{factor})
+              W_out = round(W_{in} * scale_{factor})
+        
+        Bilinear interpolation:
+          if:
+              align_corners = False , align_mode = 0
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+          else:
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
+        Bicubic interpolation:
+          if:
+              align_corners = False
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+          else:
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
+        Trilinear interpolation:
+          if:
+              align_corners = False , align_mode = 0
+              input : (N,C,D_in,H_in,W_in)
+              output: (N,C,D_out,H_out,W_out) where:
+              D_out = (D_{in}+0.5) * scale_{factor} - 0.5
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+          else:
+              input : (N,C,D_in,H_in,W_in)
+              output: (N,C,D_out,H_out,W_out) where:
+              D_out = D_{in} * scale_{factor}
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
+    For details of linear interpolation, please refer to Wikipedia:
+    https://en.wikipedia.org/wiki/Linear_interpolation.
+    
+    For details of nearest neighbor interpolation, please refer to Wikipedia:
+    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
+    
+    For details of bilinear interpolation, please refer to Wikipedia:
+    https://en.wikipedia.org/wiki/Bilinear_interpolation.
+    
+    For details of bicubic interpolation, please refer to Wikipedia:
+    https://en.wikipedia.org/wiki/Bicubic_interpolation
+    
+    For details of trilinear interpolation, please refer to Wikipedia:
+    https://en.wikipedia.org/wiki/Trilinear_interpolation.
+    
+    Parameters:
+        x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
+                          its data format is specified by :attr:`data_format`.
+        size (list|tuple|Tensor|None): Output shape of image resize
+             layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) 
+             when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
+             Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1].
+             If a Tensor Variable, its dimensions size should be a 1.
+        scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At
+             least one of :attr:`out_shape` or :attr:`scale_factor` must be set.
+             And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.
+             Default: None.
+        mode (str): The resample method. It supports 'linear', 'nearest', 'bilinear',
+                       'bicubic' and 'trilinear' currently. Default: 'nearest'
+        align_corners(bool) :  An optional bool, If True, the centers of the 4 corner pixels of the
+                               input and output tensors are aligned, preserving the values at the
+                               corner pixels.
+                               Default: False
+        align_mode(int)  :  An optional for linear/bilinear/trilinear interpolation. Refer to the formula in the example above,
+                            it can be \'0\' for src_idx = scale_factor*(dst_indx+0.5)-0.5 , can be \'1\' for
+                            src_idx = scale_factor*dst_index.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`,
+            `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_height, input_width]`. When it is `"NCDHW"`, the data is stored
+            in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`.
+        name(str, optional): The default value is None.
+                             Normally there is no need for user to set this property.
+                             For more information, please refer to :ref:`api_guide_Name`
+    Returns:
+        A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels),
+        A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
+        or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels).
+    Raises:
+        TypeError: size should be a list or tuple or Tensor.
+        ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear',
+                    'trilinear', 'bicubic', or 'nearest' currently.
+        ValueError: 'linear' only support 3-D tensor.
+        ValueError: 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor.
+        ValueError: 'trilinear' only support 5-D tensor.
+        ValueError: One of size and scale_factor must not be None.
+        ValueError: size length should be 1 for input 3-D tensor.
+        ValueError: size length should be 2 for input 4-D tensor.
+        ValueError: size length should be 3 for input 5-D tensor.
+        ValueError: scale_factor should be greater than zero.
+        TypeError: align_corners should be a bool value
+        ValueError: align_mode can only be '0' or '1'
+        ValueError: data_format can only be 'NCW', 'NWC', 'NCHW', 'NHWC', 'NCDHW' or 'NDHWC'.
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            import paddle.nn.functional as F
+            paddle.disable_static()
+
+            input_data = np.random.rand(2,3,6,10).astype("float32")
+            x = paddle.to_tensor(input_data)
+            output = F.upsample(x=x, size=[12,12])
+            print(output.shape)  # [2L, 3L, 12L, 12L]
+    """
+    return interpolate(x, size, scale_factor, mode, align_corners, align_mode,
+                       data_format)
 
-    .. math::
-      out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,size-1
-      out = out + b
 
-    In this formula:
-     - :math:`x1`: the first input contains in1_features elements, shape is [batch_size, in1_features].
-     - :math:`x2`: the second input contains in2_features elements, shape is [batch_size, in2_features].
-     - :math:`W_{i}`: the i-th learned weight, shape is [in1_features, in2_features], and learned weight's shape is [out_features, in1_features, in2_features].
-     - :math:`out_{i}`: the i-th element of out, shape is [batch_size, out_features].
-     - :math:`b`: the learned bias, shape is [1, out_features].
-     - :math:`x2^\mathrm{T}`: the transpose of :math:`x2`.
+def bilinear(x1, x2, weight, bias=None, name=None):
+    """
+
+    This layer performs bilinear on two inputs.
+    See :ref:`api_nn_Bilinear` for details and output shape.
 
     Parameters:
        x1 (Tensor): the first input tensor, it's data type should be float32, float64.
@@ -489,7 +677,7 @@ def bilinear(x1, x2, weight, bias=None, name=None):
            to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None.
 
     Returns:
-       Variable: A 2-D Tensor of shape [batch_size, out_features].
+       Tensor: A 2-D Tensor of shape [batch_size, out_features].
 
     Examples:
        .. code-block:: python
@@ -1042,7 +1230,19 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
 
     x_dim = len(x.shape)
 
-    original_data_format = data_format
+    assert x_dim in [
+        3, 4, 5
+    ], "input tesor dimension must be in [3, 4, 5] but got {}".format(x_dim)
+
+    supported_format_map = {
+        3: ["NCL", "NLC"],
+        4: ["NCHW", "NHWC"],
+        5: ["NCDHW", "NDHWC"],
+    }
+    assert data_format in supported_format_map[x_dim], \
+    "input tensor dimension is {}, it's data format should be in {} but got {}".format(
+        x_dim, supported_format_map[x_dim], data_format)
+
     unsqueezed_dim = []
 
     if isinstance(pad, Variable):
@@ -1166,3 +1366,83 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8):
     n12 = sqrt(clip(w1 * w2, min=eps * eps))
     cos_sim = w12 / n12
     return cos_sim
+
+
+def linear(x, weight, bias=None, name=None):
+    """
+
+    Fully-connected linear transformation op.
+
+    .. math::
+
+        Out = {XW + b}
+
+    where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively.
+
+    The linear op multiplies input tensor with weight matrix and
+    produces an output Tensor of shape [N, *, output_dim],
+    where N is batch size and `*` means any number of additional dimensions and output_dim is the last dim of ``weight``.
+    If ``bias`` is not None, a bias will be added to the output.
+
+    Args:
+        x(Tensor): Input tensor, its data type is float16, float32 or float64.
+        weight(Tensor): Weight tensor, its data type is float16, float32 or float64.
+        bias(Tensor|None, optional): Bias tensor, its data type is float16, float32 or float64. If it is set to None, no bias will be added to the output units. Default: None.
+        name(str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Default: None.
+
+    Returns:
+        Tensor: ``x`` matrix-multiplied by ``weight``, plus ``bias`` when ``bias`` is not None.
+
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle
+          import paddle.nn.functional as F
+
+          input = np.ones((3,1,2), dtype=np.float32)
+          weight = np.ones((2,2), dtype=np.float32)
+          bias = np.ones((2), dtype=np.float32)
+          place = paddle.CPUPlace()
+          paddle.disable_static(place)
+          input = paddle.to_tensor(input)
+          weight = paddle.to_tensor(weight)
+          bias = paddle.to_tensor(bias)
+          out = F.linear(input, weight, bias)
+          print(out) # 6 elements, all 3.0 (1*1 + 1*1 + 1), shape [3, 1, 2]
+
+    """
+    if in_dygraph_mode():
+        pre_bias = _varbase_creator(dtype=x.dtype)
+        core.ops.matmul(x, weight, pre_bias, 'transpose_X', False,
+                        'transpose_Y', False, "alpha", 1)
+        return dygraph_utils._append_bias_in_dygraph(
+            pre_bias, bias, axis=len(x.shape) - 1)
+    else:
+        helper = LayerHelper('linear', **locals())
+        dtype = x.dtype
+
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'linear')
+        check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear')
+
+        inputs = {'X': [x], 'Y': [weight]}
+        attrs = {
+            'transpose_X': False,
+            'transpose_Y': False,
+            'alpha': 1,
+        }
+        tmp = helper.create_variable_for_type_inference(dtype)
+        helper.append_op(
+            type='matmul', inputs=inputs, outputs={'Out': tmp}, attrs=attrs)
+        if bias is not None:
+            res = helper.create_variable_for_type_inference(dtype)
+            helper.append_op(
+                type='elementwise_add',
+                inputs={'X': [tmp],
+                        'Y': [bias]},
+                outputs={'Out': [res]},
+                attrs={'axis': len(x.shape) - 1})
+        else:
+            res = tmp
+        return res
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index f80f200c7163836252faa4b1c932178f6bab0dff..42d7d98aefcbbf51f562b98c4c494aeccfe20cf2 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -158,7 +158,7 @@ def conv1d(x,
         bias (Tensor, optional): The bias with shape [M,]. Default: None.
         stride (int or tuple, optional): The stride size. If stride is a tuple, it must
             contain one integers, (stride_size). Default: 1.
-        padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
+        padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms.
             1. a string in ['valid', 'same'].
             2. an int, which means the feature map is zero paded by size of `padding` on both sides.
             3. a list[int] or tuple[int] whose length is 1, which means the feature map is zero paded by size of `padding[0]` on both sides.
@@ -185,7 +185,7 @@ def conv1d(x,
         same with input.
 
     Raises:
-        ValueError: If the channel dimmention of the input is less than or equal to zero.
+        ValueError: If the channel dimension of the input is less than or equal to zero.
         ValueError: If `data_format` is not "NCL" or "NLC".
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
@@ -238,7 +238,7 @@ def conv1d(x,
     num_channels = x.shape[channel_dim]
     num_filters = weight.shape[0]
     if num_channels < 0:
-        raise ValueError("The channel dimmention of the input({}) "
+        raise ValueError("The channel dimension of the input({}) "
                          "should be defined. Received: {}.".format(
                              x.shape, num_channels))
     if num_channels % groups != 0:
@@ -260,7 +260,7 @@ def conv1d(x,
         padding = padding + [0]
     else:
         raise ValueError(
-            "The size of padding's dimmention should 1 or 2. But got padding={}".
+            "The size of padding's dimension should be 1 or 2. But got padding={}".
             format(padding))
 
     stride = utils.convert_to_list(stride, 1, 'stride') + [1]
@@ -350,7 +350,7 @@ def conv2d(x,
 
     For each input :math:`X`, the equation is:
 
-    .. math::
+    ..  math::
 
         Out = \sigma (W \\ast X + b)
 
@@ -377,7 +377,7 @@ def conv2d(x,
 
         Where
 
-        .. math::
+        ..  math::
 
             H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
             W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
@@ -424,7 +424,7 @@ def conv2d(x,
 
     Raises:
         ValueError: If `data_format` is not "NCHW" or "NHWC".
-        ValueError: If the channel dimmention of the input is less than or equal to zero.
+        ValueError: If the channel dimension of the input is less than or equal to zero.
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
             or the element corresponding to the input's channel is not 0.
@@ -465,7 +465,7 @@ def conv2d(x,
     num_channels = x.shape[channel_dim]
     num_filters = weight.shape[0]
     if num_channels < 0:
-        raise ValueError("The channel dimmention of the input({}) "
+        raise ValueError("The channel dimension of the input({}) "
                          "should be defined. Received: {}.".format(
                              x.shape, num_channels))
     if num_channels % groups != 0:
@@ -710,7 +710,7 @@ def conv_transpose1d(x,
 
     num_channels = x.shape[channel_dim]
     if num_channels < 0:
-        raise ValueError("The channel dimmention of the input({}) "
+        raise ValueError("The channel dimension of the input({}) "
                          "should be defined. Received: {}.".format(
                              x.shape, num_channels))
     if num_channels % groups != 0:
@@ -728,7 +728,7 @@ def conv_transpose1d(x,
         padding = padding + [0]
     else:
         raise ValueError(
-            "The size of padding's dimmention should 1 or 2. But got padding={}".
+            "The size of padding's dimension should be 1 or 2. But got padding={}".
             format(padding))
 
     stride = utils.convert_to_list(stride, 1, 'stride') + [1]
@@ -807,10 +807,10 @@ def conv_transpose2d(x,
                      stride=1,
                      padding=0,
                      output_padding=0,
-                     groups=1,
                      dilation=1,
-                     data_format='NCHW',
+                     groups=1,
                      output_size=None,
+                     data_format='NCHW',
                      name=None):
     """
 
@@ -829,7 +829,7 @@ def conv_transpose2d(x,
 
     For each input :math:`X`, the equation is:
 
-    .. math::
+    ..  math::
 
         Out = \sigma (W \\ast X + b)
 
@@ -856,7 +856,7 @@ def conv_transpose2d(x,
 
         Where
 
-        .. math::
+        ..  math::
 
            H^\prime_{out} &= (H_{in} - 1) * strides[0] - pad_height_top - pad_height_bottom + dilations[0] * (H_f - 1) + 1 \\\\
            W^\prime_{out} &= (W_{in} - 1) * strides[1] - pad_width_left - pad_width_right + dilations[1] * (W_f - 1) + 1 \\\\
@@ -883,28 +883,27 @@ def conv_transpose2d(x,
         stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. 
             If stride is a tuple, it must contain two integers, (stride_height, stride_width). 
             Otherwise, stride_height = stride_width = stride. Default: stride = 1.
-        padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
-             `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
-             string, either 'VALID' or 'SAME' supported, which is the padding algorithm.
-             If `padding` is a tuple or list, it could be in three forms:
-             `[pad_height, pad_width]` or
-            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and
-            when `data_format` is `'NCHW'`,
-            `padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `'NHWC'`, `padding` can be in the form
+        padding(str|int|list|tuple, optional): The padding size. It means the number of zero-paddings 
+            on both sides for each dimension. If `padding` is a string, either 'VALID' or 
+            'SAME' which is the padding algorithm. If padding size is a tuple or list,
+            it could be in three forms: `[pad_height, pad_width]` or 
+            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
+            and when `data_format` is `"NCHW"`, `padding` can be in the form
+            `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            When `data_format` is `"NHWC"`, `padding` can be in the form
             `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
         output_padding(int|list|tuple, optional): Additional size added to one side
             of each dimension in the output shape. Default: 0.
-        dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. 
-            If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). 
-            Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1.
         groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
             grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
             when group=2, the first half of the filters is only connected to the
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
             Default: groups = 1.
+        dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. 
+            If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). 
+            Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1.
         output_size(int|tuple|list, optional): The output image size. If output size is a
             tuple, it must contain two integers, (image_height, image_width). None if use
             filter_size, padding, and stride to calculate output_size.
@@ -950,7 +949,7 @@ def conv_transpose2d(x,
           paddle.disable_static()
           x_var = paddle.to_tensor(x)
           w_var = paddle.to_tensor(w)
-          y_var = F.conv2d_transpose(x_var, w_var)
+          y_var = F.conv_transpose2d(x_var, w_var)
           y_np = y_var.numpy()
           print(y_np.shape)
 
@@ -966,7 +965,7 @@ def conv_transpose2d(x,
     channel_dim = -1 if channel_last else 1
     num_channels = x.shape[channel_dim]
     if num_channels < 0:
-        raise ValueError("The channel dimmention of the input({}) "
+        raise ValueError("The channel dimension of the input({}) "
                          "should be defined. Received: {}.".format(
                              x.shape, num_channels))
     if num_channels % groups != 0:
@@ -1070,7 +1069,7 @@ def conv3d(x,
 
     For each input :math:`X`, the equation is:
 
-    .. math::
+    ..  math::
 
         Out = \sigma (W \\ast X + b)
 
@@ -1096,7 +1095,7 @@ def conv3d(x,
 
         Where
 
-        .. math::
+        ..  math::
 
             D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
             H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
@@ -1147,7 +1146,7 @@ def conv3d(x,
 
     Raises:
         ValueError: If `data_format` is not "NCDHW" or "NDHWC".
-        ValueError: If the channel dimmention of the input is less than or equal to zero.
+        ValueError: If the channel dimension of the input is less than or equal to zero.
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
             or the element corresponding to the input's channel is not 0.
@@ -1160,20 +1159,18 @@ def conv3d(x,
     Examples:
         .. code-block:: python
 
-            from paddle import fluid
-            import paddle.nn.functional as F
-            import paddle.fluid.dygraph as dg
             import numpy as np
+            import paddle
+            import paddle.nn.functional as F
 
             x = np.random.randn(2, 3, 8, 8, 8).astype(np.float32)
             w = np.random.randn(6, 3, 3, 3, 3).astype(np.float32)
 
-            place = fluid.CPUPlace()
-            with dg.guard(place):
-                x_var = dg.to_variable(x)
-                w_var = dg.to_variable(w)
-                y_var = F.conv3d(x_var, w_var, act="relu")
-                y_np = y_var.numpy()
+            paddle.disable_static()
+            x_var = paddle.to_tensor(x)
+            w_var = paddle.to_tensor(w)
+            y_var = F.conv3d(x_var, w_var)
+            y_np = y_var.numpy()
             print(y_np.shape)
 
             # (2, 6, 6, 6, 6)
@@ -1190,7 +1187,7 @@ def conv3d(x,
     num_filters = weight.shape[0]
     if num_channels < 0:
         raise ValueError(
-            "The channel dimmention of the input({}) should be defined. "
+            "The channel dimension of the input({}) should be defined. "
             "Received: {}.".format(x.shape, num_channels))
     if num_channels % groups != 0:
         raise ValueError(
@@ -1260,8 +1257,8 @@ def conv_transpose3d(x,
                      output_padding=0,
                      groups=1,
                      dilation=1,
-                     data_format='NCDHW',
                      output_size=None,
+                     data_format='NCDHW',
                      name=None):
     """
     The convolution3d transpose layer calculates the output based on the input,
@@ -1279,7 +1276,7 @@ def conv_transpose3d(x,
 
     For each input :math:`X`, the equation is:
 
-    .. math::
+    ..  math::
 
         Out = \sigma (W \\ast X + b)
 
@@ -1306,7 +1303,7 @@ def conv_transpose3d(x,
 
         Where
 
-        .. math::
+        ..  math::
 
            D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
            H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
@@ -1338,37 +1335,37 @@ def conv_transpose3d(x,
             If stride is a tuple, it must contain three integers, (stride_depth, stride_height, 
             stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. 
             Default: stride = 1.
-        padding(int|list|str|tuple, optional): The padding size. The padding argument effectively
-             adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string,
-             either 'VALID' or 'SAME' supported, which is the padding algorithm. If `padding`
-             is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or
+        padding (string|int|list|tuple, optional): The padding size. It means the number of zero-paddings 
+            on both sides for each dimension. If `padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If padding size is a tuple or list,
+            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
             `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
-            and when `data_format` is `'NCDHW'`, `padding` can be in the form
+            and when `data_format` is `"NCDHW"`, `padding` can be in the form
             `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `'NDHWC'`, `padding` can be in the form
+            When `data_format` is `"NDHWC"`, `padding` can be in the form
             `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
         output_padding(int|list|tuple, optional): Additional size added to one side
             of each dimension in the output shape. Default: 0.
-        dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. 
-            If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, 
-            dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. 
-            Default: dilation = 1.
         groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by
             grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
             when group=2, the first half of the filters is only connected to the
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
             Default: groups=1
-        data_format (str, optional): Specify the data format of the input, and the data format of the output 
-            will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
-            The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
-            `[batch_size, input_channels, input_height, input_width]`.
+        dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. 
+            If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, 
+            dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. 
+            Default: dilation = 1.
         output_size(int|list|tuple, optional): The output image size. If output size is a
             tuple, it must contain three integers, (image_depth, image_height, image_width). This
             parameter only works when filter_size is None. If output_size and filter_size are 
             specified at the same time, They should follow the formula above. Default: None. 
             Output_size and filter_size should not be None at the same time.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`.
+            The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_depth, input_height, input_width]`.
         name(str, optional): For detailed information, please refer 
            to :ref:`api_guide_Name`. Usually name is no need to set and 
            None by default.
@@ -1425,7 +1422,7 @@ def conv_transpose3d(x,
     num_filters = weight.shape[1]
     if num_channels < 0:
         raise ValueError(
-            "The channel dimmention of the input({}) should be defined. "
+            "The channel dimension of the input({}) should be defined. "
             "Received: {}.".format(x.shape, num_channels))
     if num_channels % groups != 0:
         raise ValueError(
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index 55bb36d136405385a88b991576c2a9091437d456..f1509143f3c933db12fc4ab6afd1a00b291f38f4 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -784,30 +784,30 @@ def kl_div(input, label, reduction='mean', name=None):
             import numpy as np
             import paddle.nn.functional as F
 
-            paddle.enable_imperative()
+            paddle.disable_static()
 
             shape = (5, 20)
             input = np.random.uniform(-10, 10, shape).astype('float32')
             target = np.random.uniform(-10, 10, shape).astype('float32')
 
             # 'batchmean' reduction, loss shape will be [N]
-            pred_loss = F.kl_div(paddle.to_variable(input),
-                                 paddle.to_variable(target), reduction='batchmean')
+            pred_loss = F.kl_div(paddle.to_tensor(input),
+                                 paddle.to_tensor(target), reduction='batchmean')
             # shape=[5]
 
             # 'mean' reduction, loss shape will be [1]
-            pred_loss = F.kl_div(paddle.to_variable(input),
-                                 paddle.to_variable(target), reduction='mean')
+            pred_loss = F.kl_div(paddle.to_tensor(input),
+                                 paddle.to_tensor(target), reduction='mean')
             # shape=[1]
 
             # 'sum' reduction, loss shape will be [1]
-            pred_loss = F.kl_div(paddle.to_variable(input),
-                                 paddle.to_variable(target), reduction='sum')
+            pred_loss = F.kl_div(paddle.to_tensor(input),
+                                 paddle.to_tensor(target), reduction='sum')
             # shape=[1]
 
             # 'none' reduction, loss shape is same with input shape
-            pred_loss = F.kl_div(paddle.to_variable(input),
-                                 paddle.to_variable(target), reduction='none')
+            pred_loss = F.kl_div(paddle.to_tensor(input),
+                                 paddle.to_tensor(target), reduction='none')
             # shape=[5, 20]
 
     """
diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py
index 13e86e5712a1cd5c014517e37d3803ca24cfb6fb..e9c1a21ecffb1b64cb5ae9e6b802600625cb4685 100644
--- a/python/paddle/nn/functional/norm.py
+++ b/python/paddle/nn/functional/norm.py
@@ -54,8 +54,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
     Args:
         x (Tensor): The input tensor could be N-D tensor, and the input data type could be float32 or float64.
         p (float|int, optional): The exponent value in the norm formulation. Default: 2
-        axis (int, optional): The axis on which to apply normalization. If ``x`` is 1-D tensor, ``axis`` is fixed to 0. If `axis < 0`, \
-            the dimension to normalization is `x.ndim + axis`. -1 is the last dimension.
+        axis (int, optional): The axis on which to apply normalization. If `axis < 0`, the dimension to normalize is `x.ndim + axis`. -1 is the last dimension.
         epsilon (float, optional): Small float added to denominator to avoid dividing by zero. Default is 1e-12.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
@@ -72,7 +71,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
 
             paddle.disable_static()
             x = np.arange(6, dtype=np.float32).reshape(2,3)
-            x = paddle.to_variable(x)
+            x = paddle.to_tensor(x)
             y = F.normalize(x)
             print(y.numpy())
             # [[0.         0.4472136  0.8944272 ]
@@ -88,8 +87,6 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
             # [[0.         0.24253564 0.37139067]
             # [1.         0.97014254 0.9284767 ]]
     """
-    if len(x.shape) == 1:
-        axis = 0
     if in_dygraph_mode():
         eps = fluid.dygraph.base.to_variable([epsilon], dtype=x.dtype)
         out = core.ops.p_norm(x, 'axis', axis, 'porder',
@@ -99,6 +96,10 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
     check_type(p, 'p', (float, int), 'normalize')
     check_type(axis, 'axis', (int), 'normalize')
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'normalize')
+    if len(x.shape) == 1 and axis != 0 and axis != -1:
+        raise ValueError(
+            "Axis must be 0 or -1 when x is a 1-D tensor, but received axis = {}".
+            format(axis))
 
     attrs = {
         'axis': axis,
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
index ca657b8be3e67c7acb795a0f427ca5fe2c57b1f2..c8790a75901fd5d9a38862158246e3756dc575c4 100755
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -18,124 +18,146 @@ from ...fluid.layers import pool3d  #DEFINE_ALIAS
 from ...fluid.layers import adaptive_pool2d  #DEFINE_ALIAS
 from ...fluid.layers import adaptive_pool3d  #DEFINE_ALIAS
 from ...fluid import core
-from ...fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_
-from ...fluid.layers import utils, LayerHelper
-from ...fluid.data_feeder import check_type, check_variable_and_dtype, check_type, check_dtype, convert_dtype
-from ...fluid.layers import unsqueeze, squeeze
+from ...fluid.framework import in_dygraph_mode
+from ...fluid.layers import utils, LayerHelper, unsqueeze, squeeze
+from ...fluid.data_feeder import check_type, check_variable_and_dtype
 
 __all__ = [
     'pool2d',
     'pool3d',
+    'adaptive_pool2d',
+    'adaptive_pool3d',
     'avg_pool1d',
+    'avg_pool2d',
+    'avg_pool3d',
     'max_pool1d',
+    'max_pool2d',
+    'max_pool3d',
     'adaptive_avg_pool1d',
-    'adaptive_max_pool1d',
     'adaptive_avg_pool2d',
     'adaptive_avg_pool3d',
-    'adaptive_pool2d',
-    'adaptive_pool3d',
-    'max_pool2d',
-    'avg_pool2d',
-    'max_pool3d',
-    'avg_pool3d',
+    'adaptive_max_pool1d',
+    'adaptive_max_pool2d',
+    'adaptive_max_pool3d',
 ]
 
 
-def check_input(x, dimension):
+def _is_list_or_tuple(input):
+    return isinstance(input, (list, tuple))
+
+
+def _check_input(x, dimension):
     if len(x.shape) != dimension:
-        raise ValueError("Excepted Input X is 3-D tensor, but received {}-D {}".
-                         format(len(x.shape), type(x)))
+        raise ValueError(
+            "Expected Input X is {}-D tensor, but received {}-D {}".format(
+                dimension, len(x.shape), type(x)))
 
 
-def check_instance(x, x_name, types=(int, float)):
+def _check_instance(x, x_name, types=(int, float)):
 
     if not isinstance(x, types):
         raise ValueError("Excepted {} type for {} but received type: {}. ".
                          format(types, x_name, type(x)))
 
 
-def update_padding1d(padding, pool_type='avg'):
-    def is_list_or_tuple(ele):
-        if isinstance(ele, list) or isinstance(ele, tuple):
-            return True
-        return False
-
-    if is_list_or_tuple(padding):
-        if padding.__len__() == 1 and not is_list_or_tuple(padding[0]):
-            return [0, padding[0]]
-        else:
-            raise ValueError(
-                "{}_pool1d() argument 'padding' should contain one int (got {})".
-                format(pool_type, padding.__len__()))
+def _zero_padding_in_batch_and_channel(padding, channel_last):
+    if channel_last:
+        return list(padding[0]) == [0, 0] and list(padding[-1]) == [0, 0]
     else:
-        padding = [0, padding]
+        return list(padding[0]) == [0, 0] and list(padding[1]) == [0, 0]
 
-    return padding
 
+def _exclude_padding_in_batch_and_channel(padding, channel_last):
+    padding_ = padding[1:-1] if channel_last else padding[2:]
+    padding_ = [elem for pad_a_dim in padding_ for elem in pad_a_dim]
+    return padding_
 
-def update_padding2d(padding, data_format):
-    def is_list_or_tuple(ele):
-        if isinstance(ele, list) or isinstance(ele, tuple):
-            return True
-        return False
-
-    if is_list_or_tuple(padding) and len(padding) == 4:
-        if is_list_or_tuple(padding[0]) and (data_format == "NCHW"):
-            if not (padding[0] == [0, 0] and padding[1] == [0, 0]):
-                raise ValueError(
-                    "Non-zero pool_padding(%s) in the batch or channel dimensions "
-                    "is not supported." % str(padding))
-            padding = padding[2:4]
-            padding = [ele for a_list in padding for ele in a_list]
-        elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"):
-            if not (padding[0] == [0, 0] and padding[3] == [0, 0]):
-                raise ValueError(
-                    "Non-zero pool_padding(%s) in the batch or channel dimensions "
-                    "is not supported." % str(padding))
-            padding = padding[1:3]
-            padding = [ele for a_list in padding for ele in a_list]
-        padding = utils.convert_to_list(padding, 4, 'padding')
-
-        if utils._is_symmetric_padding(padding, 2):
-            padding = [padding[0], padding[2]]
-    else:
-        padding = utils.convert_to_list(padding, 2, 'padding')
-
-    return padding
 
+def _channel_last(data_format, num_dims):
+    if num_dims == 1:
+        if data_format not in ['NCL', 'NLC']:
+            raise ValueError(
+                "Attr(data_format) should be 'NCL' or 'NLC'. Received "
+                "Attr(data_format): %s" % str(data_format))
+        else:
+            return True if data_format == "NLC" else False
+    if num_dims == 2:
+        if data_format not in ['NCHW', 'NHWC']:
+            raise ValueError(
+                "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
+                "Attr(data_format): %s" % str(data_format))
+        else:
+            return True if data_format == "NHWC" else False
+    if num_dims == 3:
+        if data_format not in ['NCDHW', 'NDHWC']:
+            raise ValueError(
+                "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
+                "Attr(data_format): %s" % str(data_format))
+        else:
+            return True if data_format == "NDHWC" else False
 
-def update_padding3d(padding, data_format):
-    def is_list_or_tuple(ele):
-        if isinstance(ele, (list, tuple)):
-            return True
-        return False
 
-    if is_list_or_tuple(padding) and len(padding) == 5:
-        if is_list_or_tuple(padding[0]) and (data_format == "NCDHW"):
-            if not (padding[0] == [0, 0] and padding[1] == [0, 0]):
+def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False):
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding not in ["SAME", "VALID"]:
+            raise ValueError(
+                "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".
+                format(padding))
+        if padding == "VALID":
+            if ceil_mode != False:
                 raise ValueError(
-                    "Non-zero pool_padding(%s) in the batch or channel dimensions "
-                    "is not supported." % str(padding))
-            padding = padding[2:5]
-            padding = [ele for a_list in padding for ele in a_list]
-        elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"):
-            if not (padding[0] == [0, 0] and padding[4] == [0, 0]):
+                    "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
+                    "Received ceil_mode: True.")
+
+            padding_algorithm = "VALID"
+            padding = [0] * num_dims
+        else:
+            padding_algorithm = "SAME"
+            padding = [0] * num_dims
+    elif _is_list_or_tuple(padding):
+        # for padding like
+        # [(pad_before, pad_after), (pad_before, pad_after), ...]
+        # padding for batch_dim and channel_dim included
+        if len(padding) == 2 + num_dims and _is_list_or_tuple(padding[0]):
+            if not _zero_padding_in_batch_and_channel(padding, channel_last):
                 raise ValueError(
-                    "Non-zero pool_padding(%s) in the batch or channel dimensions "
-                    "is not supported." % str(padding))
-            padding = padding[1:4]
-            padding = [ele for a_list in padding for ele in a_list]
-        padding = utils.convert_to_list(padding, 6, 'padding')
-        if utils._is_symmetric_padding(padding, 3):
-            padding = [padding[0], padding[2], padding[4]]
-
-    elif is_list_or_tuple(padding) and len(padding) == 6:
-        padding = utils.convert_to_list(padding, 6, 'padding')
-        if utils._is_symmetric_padding(padding, 3):
-            padding = [padding[0], padding[2], padding[4]]
+                    "Non-zero padding({}) in the batch or channel dimensions "
+                    "is not supported.".format(padding))
+            padding_algorithm = "EXPLICIT"
+            padding = _exclude_padding_in_batch_and_channel(padding,
+                                                            channel_last)
+            if utils._is_symmetric_padding(padding, num_dims):
+                padding = padding[0::2]
+        # for padding like [pad_before, pad_after, pad_before, pad_after, ...]
+        elif len(padding) == 2 * num_dims and isinstance(padding[0], int):
+            padding_algorithm = "EXPLICIT"
+            padding = utils.convert_to_list(padding, 2 * num_dims, 'padding')
+            if utils._is_symmetric_padding(padding, num_dims):
+                padding = padding[0::2]
+        # for padding like [pad_d1, pad_d2, ...]
+        elif len(padding) == num_dims and isinstance(padding[0], int):
+            padding_algorithm = "EXPLICIT"
+            padding = utils.convert_to_list(padding, num_dims, 'padding')
+        else:
+            raise ValueError("Invalid padding: {}".format(padding))
+    # for integer padding
     else:
-        padding = utils.convert_to_list(padding, 3, 'padding')
+        padding_algorithm = "EXPLICIT"
+        padding = utils.convert_to_list(padding, num_dims, 'padding')
+    return padding, padding_algorithm
+
 
+def _expand_low_nd_padding(padding):
+    #1d to 2d fake input
+    if len(padding) == 2:
+        padding = [0] * 2 + padding
+    elif len(padding) == 1:
+        padding = [0] + padding
+    else:
+        raise ValueError(
+            "The size of padding's dimension should be 1 or 2. But got padding={}".
+            format(padding))
     return padding
 
 
@@ -146,73 +168,57 @@ def avg_pool1d(x,
                count_include_pad=True,
                ceil_mode=False,
                name=None):
-    """
-
-    This operation applies a 1D average pooling over an input signal composed
-    of several input planes, based on the input, output_size, return_indices parameters.
-    Input(X) and output(Out) are in NCL format, where N is batch
-    size, C is the number of channels, L is the length of the feature.
-    The output tensor shape will be [N, C, output_size].
-
-    The output value of the layer with input size (N, C, L),
-    output (N, C, L_{out}) and kernel_size k can be precisely described as
-    For average pool1d:
-
-    ..  math::
-
-       Output(N_i, C_i, l) &= mean(Input[N_i, C_i, stride \times l:stride \times l+k])
-
+    """ 
+    This API implements average pooling 1d operation.
+    See more details in :ref:`api_nn_pooling_AvgPool1d` .
 
     Args:
         x (Tensor): The input tensor of pooling operator which is a 3-D tensor with
                           shape [N, C, L]. where `N` is batch size, `C` is the number of channels,
-                          `L` is the length of the feature. The data type if float32 or float64.
+                          `L` is the length of the feature. The data type is float32 or float64.
         kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain one integers.
+            it must contain an integer.
         stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
-            it must contain one integers.
-        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
-            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
-            it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero,
-            then the input is implicitly zero-padded on both sides for padding number of points.
+            it must contain an integer.
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every side.
+            4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
         count_include_pad (bool): Whether to exclude padding points in average pooling
-                          mode, default is `true`.
+                          mode, default is `True`.
         ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width.
-            If it is set to False, the floor function will be used. Default False
+            If it is set to False, the floor function will be used. The default value is False.
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
-
     Returns:
         Tensor: The output tensor of pooling result. The data type is same as input tensor.
 
     Raises:
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is "VALID", but `ceil_mode` is True.
-        ValueError: If `padding` is a list or tuple but its length greater than 1.
-        ShapeError: If the input is not a 3-D.
+        ValueError: If `padding` is a list or tuple but its length is greater than 1.
+        ShapeError: If the input is not a 3-D tensor.
         ShapeError: If the output's shape calculated is not greater than 0.
 
-
     Examples:
-
         .. code-block:: python
-
           import paddle
           import paddle.nn.functional as F
           paddle.disable_static()
-
           data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
-          pool_out = F.avg_pool1d(data, kernel_size=2, stride=2, padding=0)
-          # pool_out shape: [1, 3, 16]
-
+          out = F.avg_pool1d(data, kernel_size=2, stride=2, padding=0)
+          # out shape: [1, 3, 16]
     """
     """NCL to NCHW"""
     data_format = "NCHW"
-    check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'avg_pool1d')
-    check_input(x, 3)
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool1d')
+    _check_input(x, 3)
     x = unsqueeze(x, [2])
-    kernel_size = utils.convert_to_list(kernel_size, 1, 'pool_size')
+    kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size')
     kernel_size = [1] + kernel_size
     if stride is None:
         stride = kernel_size
@@ -220,33 +226,20 @@ def avg_pool1d(x,
         stride = utils.convert_to_list(stride, 1, 'pool_stride')
         stride = [1] + stride
 
-    padding_algorithm = "EXPLICIT"
-    if isinstance(padding, str):
-        padding = padding.upper()
-        if padding not in ["SAME", "VALID"]:
-            raise ValueError(
-                "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'."
-                % str(padding))
-        if padding == "VALID":
-            padding_algorithm = "VALID"
-            padding = [0]
-            if ceil_mode != False:
-                raise ValueError(
-                    "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
-                    "Received ceil_mode: True.")
-        elif padding == "SAME":
-            padding_algorithm = "SAME"
-            padding = [0]
+    channel_last = _channel_last("NCL", 1)
+    padding, padding_algorithm = _update_padding_nd(
+        padding, 1, channel_last=channel_last, ceil_mode=ceil_mode)
 
-    padding = update_padding1d(padding, "avg")
+    # 1d pooling is implemented with the 2d op, so the padding must be expanded in advance.
+    padding = _expand_low_nd_padding(padding)
 
     if in_dygraph_mode():
         output = core.ops.pool2d(
             x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling',
             False, 'strides', stride, 'paddings', padding, 'padding_algorithm',
-            padding_algorithm, 'use_cudnn', not count_include_pad, 'ceil_mode',
-            ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format',
-            data_format)
+            padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
+            'use_mkldnn', False, 'exclusive', not count_include_pad,
+            'data_format', data_format)
         return squeeze(output, [2])
 
     op_type = 'pool2d'
@@ -275,126 +268,103 @@ def avg_pool1d(x,
     return squeeze(pool_out, [2])
 
 
-def max_pool1d(x,
+def avg_pool2d(x,
                kernel_size,
                stride=None,
                padding=0,
-               return_indices=False,
                ceil_mode=False,
+               count_include_pad=True,
+               divisor_override=None,
+               data_format="NCHW",
                name=None):
     """
-
-    Applies a 1D max pooling over an input signal composed of several input planes based
-    on the input, output_size, return_indices parameters.
-    Input(X) and output(Out) are in NCL format, where N is batch
-    size, C is the number of channels, L is the length of the feature.
-
-    The output value of the layer with input size (N, C, L),
-    output (N, C, L_{out}) and kernel_size k can be precisely described as
-    For average pool1d:
-
-    ..  math::
-
-       Output(N_i, C_i, l) &=  max(Input[N_i, C_i, stride \times l:stride \times l+k])}
-
+    This API implements average pooling 2d operation.
+    See more details in :ref:`api_nn_pooling_AvgPool2d` .
+ 
     Args:
-        x (Tensor): The input tensor of pooling operator which is a 3-D tensor with
-                          shape [N, C, L], where `N` is batch size, `C` is the number of channels,
-                          `L` is the length of the feature. The data type if float32 or float64.
-        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain one integers.
-        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
-            it must contain one integers.
-        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
-            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
-            it could be the following forms: `[pad_left, pad_right]`.
-        return_indices (bool): Whether return the max indices along with the outputs. default is `False`.
-        ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default.
-            If it is set to False, the floor function will be used. Default False.
+        x (Tensor): The input tensor of pooling operator which is a 4-D tensor with
+                          shape [N, C, H, W]. The format of input tensor is `"NCHW"` or
+                          `"NHWC"`, where `N` is batch size, `C` is the number of channels,
+                          `H` is the height of the feature, and `W` is the width of the
+                          feature. The data type is float32 or float64.
+        kernel_size (int|list|tuple): The pool kernel size. If it is a tuple or list,
+            it must contain two integers, (kernel_size_Height, kernel_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        stride (int|list|tuple): The stride size. If it is a tuple or list,
+            it must contain two integers, (stride_Height, stride_Width).
+            Otherwise, the stride size will be a square of an int.
+
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_width] whose value means the padding size of each dimension.
+            4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
+        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
+        count_include_pad (bool): Whether to include padding points when computing the average,
+                          default is `True`.
+        divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
-
     Returns:
         Tensor: The output tensor of pooling result. The data type is same as input tensor.
-
     Raises:
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is "VALID", but `ceil_mode` is True.
-        ValueError: If `padding` is a list or tuple but its length greater than 1.
-        ShapeError: If the input is not a 3-D.
         ShapeError: If the output's shape calculated is not greater than 0.
-
-
     Examples:
-
         .. code-block:: python
-
           import paddle
           import paddle.nn.functional as F
+          import numpy as np
           paddle.disable_static()
-
-          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
-          pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0)
-          # pool_out shape: [1, 3, 16]
-
-          pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_indices=True)
-          # pool_out shape: [1, 3, 16],  indices shape: [1, 3, 16]
-
+          # avg pool2d
+          x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
+          out = F.avg_pool2d(x,
+                                kernel_size=2,
+                                stride=2, padding=0)
+          # out.shape [1, 3, 16, 16]
     """
-    """NCL to NCHW"""
-    data_format = "NCHW"
-    check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'max_pool1d')
-    check_input(x, 3)
-    x = unsqueeze(x, [2])
-    kernel_size = [1] + utils.convert_to_list(kernel_size, 1, 'pool_size')
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool2d')
+    kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size')
     if stride is None:
         stride = kernel_size
     else:
-        stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride')
-
-    padding_algorithm = "EXPLICIT"
-    if isinstance(padding, str):
-        padding = padding.upper()
-        if padding not in ["SAME", "VALID"]:
-            raise ValueError(
-                "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'."
-                % str(padding))
-        if padding == "VALID":
-            padding_algorithm = "VALID"
-            padding = [0]
-            if ceil_mode != False:
-                raise ValueError(
-                    "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
-                    "Received ceil_mode: True.")
-        elif padding == "SAME":
-            padding_algorithm = "SAME"
-            padding = [0]
+        stride = utils.convert_to_list(stride, 2, 'pool_stride')
 
-    padding = update_padding1d(padding, 'max')
+    channel_last = _channel_last(data_format, 2)
+    padding, padding_algorithm = _update_padding_nd(
+        padding, 2, channel_last, ceil_mode=ceil_mode)
 
     if in_dygraph_mode():
-        pool_out = core.ops.max_pool2d_with_index(
-            x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride,
-            'paddings', padding, 'padding_algorithm', padding_algorithm,
-            'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False,
-            'exclusive', True, 'data_format', data_format)
-        return (squeeze(pool_out[0], [2]), squeeze(
-            pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
+        output = core.ops.pool2d(
+            x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling',
+            False, 'padding_algorithm', padding_algorithm, 'strides', stride,
+            'paddings', padding, 'use_cudnn', True, 'ceil_mode', ceil_mode,
+            'use_mkldnn', False, 'exclusive', not count_include_pad,
+            'data_format', data_format)
+        if divisor_override is None:
+            return output
+        else:
+            _check_instance(divisor_override, "divisor_override")
+            return output * (kernel_size[0] * kernel_size[1]) / divisor_override
 
-    op_type = 'max_pool2d_with_index'
+    op_type = 'pool2d'
     helper = LayerHelper(op_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
-    mask = helper.create_variable_for_type_inference(dtype)
-    outputs = {"Out": pool_out, "Mask": mask}
 
     helper.append_op(
         type=op_type,
         inputs={"X": x},
-        outputs=outputs,
+        outputs={"Out": pool_out},
         attrs={
-            "pooling_type": 'max',
+            "pooling_type": "avg",
             "ksize": kernel_size,
             "global_pooling": False,
             "strides": stride,
@@ -403,335 +373,211 @@ def max_pool1d(x,
             "use_cudnn": True,
             "ceil_mode": ceil_mode,
             "use_mkldnn": False,
-            "exclusive": True,
+            "exclusive": not count_include_pad,
             "data_format": data_format,
         })
 
-    return (squeeze(pool_out, [2]),
-            squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
-
-
-def adaptive_avg_pool1d(x, output_size, name=None):
-    """
-
-    This operation applies a 1D adaptive average pooling over an input signal composed
-    of several input planes, based on the input, output_size, return_indices parameters.
-    Input(X) and output(Out) are in NCL format, where N is batch
-    size, C is the number of channels, L is the length of the feature.
-    The output tensor shape will be [N, C, output_size].
-
-    For average adaptive pool1d:
-
-    ..  math::
-
-        lstart &= floor(i * L_{in} / L_{out})
-
-        lend &= ceil((i + 1) * L_{in} / L_{out})
-
-        Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)}
-
-    Args:
-        x (Tensor): The input tensor of pooling operator, which is a 3-D tensor
-                              with shape [N, C, L].  The format of input tensor is NCL,
-                              where N is batch size, C is the number of channels, L is the
-                              length of the feature. The data type is float32 or float64.
-        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-                it must contain one int.
-        name(str, optional): For detailed information, please refer
-                                 to :ref:`api_guide_Name`. Usually name is no need to set and
-                                 None by default.
-
-    Returns:
-            Tensor: The output tensor of adaptive average pooling result. The data type is same
-                      as input tensor.
-
-    Raises:
-            ValueError: 'output_size' should be a integer or list or tuple with length as 1.
-
-    Examples:
-        .. code-block:: python
-
-              # average adaptive pool1d
-              # suppose input data in shape of [N, C, L], `output_size` is m or [m],
-              # output shape is [N, C, m], adaptive pool divide L dimension
-              # of input data into m grids averagely and performs poolings in each
-              # grid to get output.
-              # adaptive max pool performs calculations as follow:
-              #
-              #     for i in range(m):
-              #         lstart = floor(i * L / m)
-              #         lend = ceil((i + 1) * L / m)
-              #         output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend)
-              #
-              import paddle
-              import paddle.nn.functional as F
-              paddle.disable_static()
-
-              data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
-              pool_out = F.adaptive_average_pool1d(data, output_size=16)
-              # pool_out shape: [1, 3, 16])
-    """
-    pool_type = 'avg'
-    check_variable_and_dtype(x, 'input', ['float32', 'float64'],
-                             'adaptive_pool2d')
-    check_input(x, 3)
-    check_type(output_size, 'pool_size', (int), 'adaptive_pool1d')
-
-    pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
-
-    l_type = "pool2d"
-    x = unsqueeze(x, [2])
-    if in_dygraph_mode():
-        pool_out = core.ops.pool2d(x, 'pooling_type', pool_type, 'ksize',
-                                   pool_size, 'adaptive', True)
-        return squeeze(pool_out, [2])
-
-    helper = LayerHelper(l_type, **locals())
-    dtype = helper.input_dtype()
-    pool_out = helper.create_variable_for_type_inference(dtype)
-
-    outputs = {"Out": pool_out}
-    helper.append_op(
-        type=l_type,
-        inputs={"X": x},
-        outputs=outputs,
-        attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "adaptive": True,
-        })
-
-    return squeeze(pool_out, [2])
+    if divisor_override is None:
+        return pool_out
+    else:
+        _check_instance(divisor_override, "divisor_override")
+        return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override
 
 
-def adaptive_max_pool1d(x, output_size, return_indices=False, name=None):
+def avg_pool3d(x,
+               kernel_size,
+               stride=None,
+               padding=0,
+               ceil_mode=False,
+               count_include_pad=False,
+               divisor_override=None,
+               data_format="NCDHW",
+               name=None):
     """
-    This operation applies a 1D adaptive max pooling over an input signal composed
-    of several input planes, based on the input, output_size, return_indices parameters.
-    Input(X) and output(Out) are in NCL format, where N is batch
-    size, C is the number of channels, L is the length of the feature.
-    The output tensor shape will be [N, C, output_size].
-
-    For max adaptive pool1d:
-
-    ..  math::
-
-        lstart &= floor(i * L_{in} / L_{out})
-
-        lend &= ceil((i + 1) * L_{in} / L_{out})
-
-        Output(i) &= max(Input[lstart:lend])}
+    This API implements average pooling 3d operation.
+    See more details in :ref:`api_nn_pooling_AvgPool3d` .
 
     Args:
-        x (Tensor): The input tensor of pooling operator, which is a 3-D tensor
-                              with shape [N, C, L].  The format of input tensor is NCL,
-                              where N is batch size, C is the number of channels, L is the
-                              length of the feature. The data type is float32 or float64.
-        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-                it must contain one int.
-        return_indices (bool): If true, the index of max pooling point will be returned along
-                with outputs. It cannot be set in average pooling type. Default False.
+        x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with
+                          shape [N, C, D, H, W], where `N` represents the batch size, `C` represents
+                          the number of channels, `D`, `H` and `W` represent the depth, height and width of the feature respectively.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
+            is a tuple or list, it must contain three integers,
+            (kernel_size_Depth, kernel_size_Height, kernel_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain three integers, (stride_Depth, stride_Height, stride_Width).
+            Otherwise, the pool stride size will be a cube of an int.
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_width] whose value means the padding size of each dimension.
+            4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
+        ceil_mode (bool): When True, will use `ceil` instead of `floor` to compute the output shape.
+        count_include_pad (bool): Whether to include padding points when computing the average,
+                          default is False.
+        divisor_override (int|float): If specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
+                        The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_depth, input_height, input_width]`.
         name(str, optional): For detailed information, please refer
-                                 to :ref:`api_guide_Name`. Usually name is no need to set and
-                                 None by default.
-
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
     Returns:
-            Tensor: The output tensor of adaptive pooling result. The data type is same
-                      as input tensor.
-
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
     Raises:
-            ValueError: 'output_size' should be a integer or list or tuple with length as 1.
-
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
     Examples:
         .. code-block:: python
-
-              # max adaptive pool1d
-              # suppose input data in shape of [N, C, L], `output_size` is m or [m],
-              # output shape is [N, C, m], adaptive pool divide L dimension
-              # of input data into m grids averagely and performs poolings in each
-              # grid to get output.
-              # adaptive max pool performs calculations as follow:
-              #
-              #     for i in range(m):
-              #         lstart = floor(i * L / m)
-              #         lend = ceil((i + 1) * L / m)
-              #         output[:, :, i] = max(input[:, :, lstart: lend])
-              #
-              import paddle
-              import paddle.nn.functional as F
-              paddle.disable_static()
-
-              data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
-              pool_out = F.adaptive_max_pool1d(data, output_size=16)
-              # pool_out shape: [1, 3, 16])
-
-              pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_indices=True)
-              # pool_out shape: [1, 3, 16] indices  shape: [1, 3, 16]
-
+          import numpy as np
+          import paddle
+          x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
+          # avg pool3d
+          out = paddle.nn.functional.avg_pool3d(
+                                            x,
+                                            kernel_size = 2,
+                                            stride = 2,
+                                            padding=0)
+          # out.shape: [1, 3, 16, 16, 16]
     """
-    pool_type = 'max'
-    check_variable_and_dtype(x, 'input', ['float32', 'float64'],
-                             'adaptive_max_pool1d')
-    check_input(x, 3)
-    check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d')
-    check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d')
-
-    pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d')
+    kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = utils.convert_to_list(stride, 3, 'pool_stride')
 
-    l_type = 'max_pool2d_with_index'
+    channel_last = _channel_last(data_format, 3)
+    padding, padding_algorithm = _update_padding_nd(
+        padding, 3, channel_last=channel_last, ceil_mode=ceil_mode)
 
-    x = unsqueeze(x, [2])
     if in_dygraph_mode():
-        pool_out = core.ops.max_pool2d_with_index(
-            x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True)
-        return (squeeze(pool_out[0], [2]), squeeze(
-            pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
+        output = core.ops.pool3d(
+            x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride,
+            'paddings', padding, 'global_pooling', False, 'padding_algorithm',
+            padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
+            'use_mkldnn', False, 'exclusive', not count_include_pad,
+            'data_format', data_format)
+        if divisor_override is None:
+            return output
+        else:
+            _check_instance(divisor_override, "divisor_override")
+            return output * (kernel_size[0] * kernel_size[1] *
+                             kernel_size[2]) / divisor_override
 
-    helper = LayerHelper(l_type, **locals())
+    op_type = "pool3d"
+    helper = LayerHelper(op_type, **locals())
     dtype = helper.input_dtype()
-    pool_out = helper.create_variable_for_type_inference(dtype)
-
-    mask = helper.create_variable_for_type_inference(dtype)
-    outputs = {"Out": pool_out, "Mask": mask}
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out}
 
     helper.append_op(
-        type=l_type,
+        type=op_type,
         inputs={"X": x},
         outputs=outputs,
         attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "adaptive": True,
+            "pooling_type": 'avg',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": not count_include_pad,
+            "data_format": data_format,
         })
 
-    return (squeeze(pool_out, [2]),
-            squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
+    if divisor_override is None:
+        return pool_out
+    else:
+        _check_instance(divisor_override, "divisor_override")
+        return pool_out * (kernel_size[0] * kernel_size[1] *
+                           kernel_size[2]) / divisor_override
 
 
-def max_pool2d(x,
+def max_pool1d(x,
                kernel_size,
                stride=None,
                padding=0,
                return_indices=False,
                ceil_mode=False,
-               data_format="NCHW",
                name=None):
     """
-    This operation applies 2D max pooling over input feature based on the input,
-    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
-    in NCHW format, where N is batch size, C is the number of channels,
-    H is the height of the feature, and W is the width of the feature.
-
-    Example:
-      Input:
-           X shape: $(N, C, H_{in}, W_{in})$
-      Attr:
-           kernel_size: ksize
-           stride: stride
-
-      Output:
-           Out shape: $(N, C, H_{out}, W_{out})$
-           $$
-           out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\
-                                    & \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
-                                                   \text{stride[1]} \times w + n)
-           $$
+    This API implements max pooling 1d operation.
+    See more details in :ref:`api_nn_pooling_MaxPool1d` .
 
     Args:
-        x (Tensor): The input tensor of pooling operator which is a 4-D tensor with
-                          shape [N, C, H, W]. The format of input tensor is `"NCHW"` or
-                          `"NHWC"`, where `N` is batch size, `C` is the number of channels,
-                          `H` is the height of the feature, and `W` is the width of the
-                          feature. The data type if float32 or float64.
+        x (Tensor): The input tensor of pooling operator which is a 3-D tensor with
+                          shape [N, C, L], where `N` is batch size, `C` is the number of channels,
+                          `L` is the length of the feature. The data type is float32 or float64.
         kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain two integers, (pool_size_Height, pool_size_Width).
-            Otherwise, the pool kernel size will be a square of an int.
+            it must contain an integer.
         stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
-            it must contain two integers, (pool_stride_Height, pool_stride_Width).
-            Otherwise, the pool stride size will be a square of an int.
-        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
-            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
-            it could be in three forms: `[pad_height, pad_width]` or
-            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
-            `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
-            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
-            Otherwise, the pool padding size will be a square of an int.
-        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
-        return_indices (bool): Whether to return the max indices along with the outputs.
-        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
-                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
-                        `[batch_size, input_channels, input_height, input_width]`.
+            it must contain an integer.
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An integer, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every side.
+            4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
+        return_indices (bool): Whether to return the max indices along with the outputs. Default is `False`.
+        ceil_mode (bool): Whether to use the ceil function to calculate the output length.
+            If it is set to False, the floor function will be used. Default is `False`.
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
-
     Returns:
         Tensor: The output tensor of pooling result. The data type is same as input tensor.
+
     Raises:
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the input is not a 3-D tensor.
         ShapeError: If the output's shape calculated is not greater than 0.
+
     Examples:
         .. code-block:: python
           import paddle
           import paddle.nn.functional as F
-          import numpy as np
           paddle.disable_static()
-
-          # max pool2d
-          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
-          output = F.max_pool2d(input,
-                                kernel_size=2,
-                                stride=2, padding=0)
-          # output.shape [1, 3, 16, 16]
-
-          # for return_indices=True
-          output, max_indices = F.max_pool2d(input,
-                                             kernel_size=2,
-                                             stride=2,
-                                             padding=0,
-                                             return_indices=True)
-          # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0)
+          # pool_out shape: [1, 3, 16]
+          pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_indices=True)
+          # pool_out shape: [1, 3, 16],  indices shape: [1, 3, 16]
     """
-    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool2d')
-    kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size')
+    """NCL to NCHW"""
+    data_format = "NCHW"
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool1d')
+    _check_input(x, 3)
+    x = unsqueeze(x, [2])
+    kernel_size = [1] + utils.convert_to_list(kernel_size, 1, 'pool_size')
     if stride is None:
         stride = kernel_size
     else:
-        stride = utils.convert_to_list(stride, 2, 'pool_stride')
+        stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride')
 
-    if data_format not in ["NCHW", "NHWC"]:
-        raise ValueError(
-            "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
-            "Attr(data_format): %s." % str(data_format))
-    padding_algorithm = "EXPLICIT"
-    if isinstance(padding, str):
-        padding = padding.upper()
-        if padding not in ["SAME", "VALID"]:
-            raise ValueError(
-                "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'."
-                % str(padding))
-        if padding == "VALID":
-            padding_algorithm = "VALID"
-            padding = [0, 0]
-            if ceil_mode != False:
-                raise ValueError(
-                    "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
-                    "Received ceil_mode: True.")
-        elif padding == "SAME":
-            padding_algorithm = "SAME"
-            padding = [0, 0]
+    padding, padding_algorithm = _update_padding_nd(
+        padding, 1, ceil_mode=ceil_mode)
 
-    padding = update_padding2d(padding, data_format)
+    # The 1d pooling is implemented with the 2d op, so the padding must be expanded in advance.
+    padding = _expand_low_nd_padding(padding)
 
     if in_dygraph_mode():
-        output = core.ops.max_pool2d_with_index(
+        pool_out = core.ops.max_pool2d_with_index(
             x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride,
             'paddings', padding, 'padding_algorithm', padding_algorithm,
             'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False,
             'exclusive', True, 'data_format', data_format)
-        return output if return_indices else output[0]
+        return (squeeze(pool_out[0], [2]), squeeze(
+            pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
 
     op_type = 'max_pool2d_with_index'
     helper = LayerHelper(op_type, **locals())
@@ -758,36 +604,21 @@ def max_pool2d(x,
             "data_format": data_format,
         })
 
-    return (pool_out, mask) if return_indices else pool_out
+    return (squeeze(pool_out, [2]),
+            squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
 
 
-def avg_pool2d(x,
+def max_pool2d(x,
                kernel_size,
                stride=None,
                padding=0,
+               return_indices=False,
                ceil_mode=False,
-               count_include_pad=True,
-               divisor_override=None,
                data_format="NCHW",
                name=None):
     """
-    This operation applies 2D average pooling over input features based on the input,
-    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
-    in NCHW format, where N is batch size, C is the number of channels,
-    H is the height of the feature, and W is the width of the feature.
-
-    Example:
-      Input:
-           X shape: $(N, C, H_{in}, W_{in})$
-      Attr:
-           kernel_size: ksize
-
-      Output:
-           Out shape: $(N, C, H_{out}, W_{out})$
-           $$
-           out(N_i, C_j, h, w)  = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1}
-                               input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
-           $$
+    This API implements max pooling 2d operation.
+    See more details in :ref:`api_nn_pooling_MaxPool2d` .
 
     Args:
         x (Tensor): The input tensor of pooling operator which is a 4-D tensor with
@@ -796,30 +627,26 @@ def avg_pool2d(x,
                           `H` is the height of the feature, and `W` is the width of the
                           feature. The data type if float32 or float64.
         kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain two integers, (pool_size_Height, pool_size_Width).
+            it must contain two integers, (kernel_size_Height, kernel_size_Width).
             Otherwise, the pool kernel size will be a square of an int.
         stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
-            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            it must contain two integers, (stride_Height, stride_Width).
             Otherwise, the pool stride size will be a square of an int.
-        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
-            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
-            it could be in three forms: `[pad_height, pad_width]` or
-            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
-            `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
-            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
-            Otherwise, the pool padding size will be a square of an int.
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_width] whose value means the padding size of each dimension.
+            4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
         ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
-        count_include_pad (bool): Whether to exclude padding points in average pooling
-                          mode, default is `true`.
-        divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
-        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+        return_indices (bool): Whether to return the max indices along with the outputs.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
                         The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
                         `[batch_size, input_channels, input_height, input_width]`.
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
-
     Returns:
         Tensor: The output tensor of pooling result. The data type is same as input tensor.
     Raises:
@@ -832,87 +659,71 @@ def avg_pool2d(x,
           import paddle.nn.functional as F
           import numpy as np
           paddle.disable_static()
-
-          # avg pool2d
-          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
-          output = F.avg_pool2d(input,
+          # max pool2d
+          x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
+          out = F.max_pool2d(x,
                                 kernel_size=2,
                                 stride=2, padding=0)
           # output.shape [1, 3, 16, 16]
-
+          # for return_indices=True
+          out, max_indices = F.max_pool2d(x,
+                                             kernel_size=2,
+                                             stride=2,
+                                             padding=0,
+                                             return_indices=True)
+          # out.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
     """
-    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool2d')
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool2d')
     kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size')
     if stride is None:
         stride = kernel_size
     else:
         stride = utils.convert_to_list(stride, 2, 'pool_stride')
 
-    padding_algorithm = "EXPLICIT"
-    if isinstance(padding, str):
-        padding = padding.upper()
-        if padding not in ["SAME", "VALID"]:
-            raise ValueError(
-                "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
-                % str(padding))
-        if padding == "VALID":
-            padding_algorithm = "VALID"
-            padding = [0, 0]
-            if ceil_mode != False:
-                raise ValueError(
-                    "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. "
-                    "Received ceil_mode: True.")
-        elif padding == "SAME":
-            padding_algorithm = "SAME"
-            padding = [0, 0]
-
     if data_format not in ["NCHW", "NHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
             "Attr(data_format): %s." % str(data_format))
-    pool_padding = update_padding2d(padding, data_format)
+
+    channel_last = True if data_format == "NHWC" else False
+
+    padding, padding_algorithm = _update_padding_nd(
+        padding, num_dims=2, channel_last=channel_last, ceil_mode=ceil_mode)
 
     if in_dygraph_mode():
-        output = core.ops.pool2d(
-            x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling',
-            False, 'padding_algorithm', padding_algorithm, 'strides', stride,
-            'paddings', pool_padding, 'use_cudnn', True, 'ceil_mode', ceil_mode,
-            'use_mkldnn', False, 'exclusive', not count_include_pad,
-            'data_format', data_format)
-        if divisor_override is None:
-            return output
-        else:
-            check_instance(divisor_override, "divisor_override")
-            return output * (kernel_size[0] * kernel_size[1]) / divisor_override
+        output = core.ops.max_pool2d_with_index(
+            x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride,
+            'paddings', padding, 'padding_algorithm', padding_algorithm,
+            'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False,
+            'exclusive', True, 'data_format', data_format)
+        return output if return_indices else output[0]
 
-    op_type = 'pool2d'
+    op_type = 'max_pool2d_with_index'
     helper = LayerHelper(op_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
+    mask = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out, "Mask": mask}
 
     helper.append_op(
         type=op_type,
         inputs={"X": x},
-        outputs={"Out": pool_out},
+        outputs=outputs,
         attrs={
-            "pooling_type": "avg",
+            "pooling_type": 'max',
             "ksize": kernel_size,
             "global_pooling": False,
             "strides": stride,
-            "paddings": pool_padding,
+            "paddings": padding,
             "padding_algorithm": padding_algorithm,
             "use_cudnn": True,
             "ceil_mode": ceil_mode,
             "use_mkldnn": False,
-            "exclusive": not count_include_pad,
+            "exclusive": True,
             "data_format": data_format,
         })
 
-    if divisor_override is None:
-        return pool_out
-    else:
-        check_instance(divisor_override, "divisor_override")
-        return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override
+    return (pool_out, mask) if return_indices else pool_out
 
 
 def max_pool3d(x,
@@ -924,47 +735,25 @@ def max_pool3d(x,
                data_format="NCDHW",
                name=None):
     """
-    This operation applies 3D max pooling over input features based on the input,
-    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
-    in NCDHW format, where N is batch size, C is the number of channels,
-    H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.
-
-    Example:
-      Input:
-           X shape: $(N, C, D_{in}, H_{in}, W_{in})$
-      Attr:
-           kernel_size: ksize
-
-      Output:
-           Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
-           $$
-           \text{out}(N_i, C_j, d, h, w) ={} & \max_{k=0, \ldots, ksize[0]-1} \max_{m=0, \ldots, ksize[1]-1} \max_{n=0, \ldots, ksize[2]-1} \\
-                                              & \text{input}(N_i, C_j, \text{stride[0]} \times d + k,
-                                                             \text{stride[1]} \times h + m, \text{stride[2]} \times w + n)
-           $$
-
+    This API implements max pooling 3d operation.
+    See more details in :ref:`api_nn_pooling_MaxPool3d` .
     Args:
         x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with
-                          shape [N, C, D, H, W]. The format of
-                          input tensor is `"NCDHW"` or `"NDHWC"`, where `N` is batch size, `C` is
-                          the number of channels, `D` is the depth of the feature,
-                          `H` is the height of the feature, and `W` is the width
-                          of the feature.
-        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
+                          shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` or `"NDHWC"`, where N represents batch size, C represents the number of channels, D, H and W represent the depth, height and width of the feature respectively. 
+        kernel_size (int|list|tuple): The pool kernel size. If the kernel size
             is a tuple or list, it must contain three integers,
-            (pool_size_Depth, pool_size_Height, pool_size_Width).
+            (kernel_size_Depth, kernel_size_Height, kernel_size_Width).
             Otherwise, the pool kernel size will be the cube of an int.
-        stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
-            'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
-            it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain three integers, (stride_Depth, stride_Height, stride_Width).
             Otherwise, the pool stride size will be a cube of an int.
-        padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
-            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
-            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
-            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
-            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
-            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_width] whose value means the padding size of each dimension.
+            4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
         ceil_mode (bool): ${ceil_mode_comment}
         return_indices (bool): Whether to return the max indices along with the outputs.
         data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
@@ -973,7 +762,6 @@ def max_pool3d(x,
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
-
     Returns:
         Tensor: The output tensor of pooling result. The data type is same as input tensor.
     Raises:
@@ -986,23 +774,20 @@ def max_pool3d(x,
           import paddle.nn.functional as F
           import numpy as np
           paddle.disable_static()
-
           # max pool3d
-          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
-          output = F.max_pool2d(input,
+          x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
+          output = F.max_pool3d(x,
                                 kernel_size=2,
                                 stride=2, padding=0)
           output.shape [1, 3, 16, 16, 16]
-
           # for return_indices=True
-          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
-          output, max_indices = paddle.nn.functional.max_pool3d(input,
+          x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
+          output, max_indices = paddle.nn.functional.max_pool3d(x,
                                         kernel_size = 2,
                                         stride = 2,
                                         padding=0,
                                         return_indices=True)
           # output.shape [None, 3, 16, 16, 16], max_indices.shape [None, 3, 16, 16, 16],
-
     """
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d')
     kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size')
@@ -1011,29 +796,10 @@ def max_pool3d(x,
     else:
         stride = utils.convert_to_list(stride, 3, 'pool_stride')
 
-    padding_algorithm = "EXPLICIT"
-    if isinstance(padding, str):
-        padding = padding.upper()
-        if padding not in ["SAME", "VALID"]:
-            raise ValueError(
-                "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
-                % str(padding))
-        if padding == "VALID":
-            padding_algorithm = "VALID"
-            padding = [0, 0, 0]
-            if ceil_mode != False:
-                raise ValueError(
-                    "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. "
-                    "Received ceil_mode: True.")
-        elif padding == "SAME":
-            padding_algorithm = "SAME"
-            padding = [0, 0, 0]
+    channel_last = _channel_last(data_format, 3)
 
-    if data_format not in ["NCDHW", "NDHWC"]:
-        raise ValueError(
-            "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
-            "Attr(data_format): %s" % str(data_format))
-    padding = update_padding3d(padding, data_format)
+    padding, padding_algorithm = _update_padding_nd(
+        padding, 3, channel_last=channel_last, ceil_mode=ceil_mode)
 
     if in_dygraph_mode():
         output = core.ops.max_pool3d_with_index(
@@ -1071,170 +837,83 @@ def max_pool3d(x,
     return (pool_out, mask) if return_indices else pool_out
 
 
-def avg_pool3d(x,
-               kernel_size,
-               stride=None,
-               padding=0,
-               ceil_mode=False,
-               count_include_pad=False,
-               divisor_override=None,
-               data_format="NCDHW",
-               name=None):
+def adaptive_avg_pool1d(x, output_size, name=None):
     """
-    This operation applies 3D max pooling over input features based on the input,
-    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
-    in NCDHW format, where N is batch size, C is the number of channels,
-    H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.
-
+    This API implements adaptive average pooling 1d operation.
+    See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` .
+    
     Args:
-        input (Tensor): The input tensor of pooling operator, which is a 5-D tensor with
-                          shape [N, C, D, H, W], where `N` is batch size, `C` is
-                          the number of channels, `D` is the depth of the feature,
-                          `H` is the height of the feature, and `W` is the width
-                          of the feature.
-        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
-            is a tuple or list, it must contain three integers,
-            (pool_size_Depth, pool_size_Height, pool_size_Width).
-            Otherwise, the pool kernel size will be the cube of an int.
-        stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
-            'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
-            it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
-            Otherwise, the pool stride size will be a cube of an int.
-        padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
-            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
-            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
-            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
-            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
-            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
-        ceil_mode (bool): ${ceil_mode_comment}
-        count_include_pad (bool): Whether to exclude padding points in average pooling
-                          mode, default is True.
-        divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
-        data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
-                        The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
-                        `[batch_size, input_channels, input_depth, input_height, input_width]`.
+        x (Tensor): The input tensor of pooling operator, which is a 3-D tensor
+                              with shape [N, C, L].  The format of input tensor is NCL,
+                              where N is batch size, C is the number of channels, L is the
+                              length of the feature. The data type is float32 or float64.
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+                it must contain one int.
         name(str, optional): For detailed information, please refer
-                             to :ref:`api_guide_Name`. Usually name is no need to set and
-                             None by default.
-
-
+                                 to :ref:`api_guide_Name`. Usually name is no need to set and
+                                 None by default.
     Returns:
-        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+            Tensor: The output tensor of adaptive average pooling result. The data type is same
+                      as input tensor.
     Raises:
-        ValueError: If `padding` is a string, but not "SAME" or "VALID".
-        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
-        ShapeError: If the output's shape calculated is not greater than 0.
+            ValueError: 'output_size' should be an integer or list or tuple with length as 1.
     Examples:
         .. code-block:: python
-          import paddle.fluid as fluid
-          import paddle
-          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
-          # avg pool3d
-          pool3d = paddle.nn.functional.avg_pool3d(
-                                            input,
-                                            kernel_size = 2,
-                                            stride = 2,
-                                            padding=0)
-          # pool3d.shape: [1, 3, 16, 16, 16]
-    """
-    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d')
-    kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size')
-    if stride is None:
-        stride = kernel_size
-    else:
-        stride = utils.convert_to_list(stride, 3, 'pool_stride')
-
-    padding_algorithm = "EXPLICIT"
-    if isinstance(padding, str):
-        padding = padding.upper()
-        if padding not in ["SAME", "VALID"]:
-            raise ValueError(
-                "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
-                % str(padding))
-        if padding == "VALID":
-            padding_algorithm = "VALID"
-            padding = [0, 0, 0]
-            if ceil_mode != False:
-                raise ValueError(
-                    "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. "
-                    "Received ceil_mode: True.")
-        elif padding == "SAME":
-            padding_algorithm = "SAME"
-            padding = [0, 0, 0]
+              # average adaptive pool1d
+              # suppose input data in shape of [N, C, L], `output_size` is m or [m],
+              # output shape is [N, C, m], adaptive pool divide L dimension
+              # of input data into m grids averagely and performs poolings in each
+              # grid to get output.
+              # adaptive avg pool performs calculations as follows:
+              #
+              #     for i in range(m):
+              #         lstart = floor(i * L / m)
+              #         lend = ceil((i + 1) * L / m)
+              #         output[:, :, i] = sum(input[:, :, lstart: lend])/(lend - lstart)
+              #
+              import paddle
+              import paddle.nn.functional as F
+              paddle.disable_static()
+              data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+              pool_out = F.adaptive_avg_pool1d(data, output_size=16)
+              # pool_out shape: [1, 3, 16]
+    """
+    pool_type = 'avg'
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'adaptive_pool2d')
+    _check_input(x, 3)
+    check_type(output_size, 'pool_size', (int), 'adaptive_pool1d')
 
-    if data_format not in ["NCDHW", "NDHWC"]:
-        raise ValueError(
-            "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
-            "Attr(data_format): %s" % str(data_format))
-    padding = update_padding3d(padding, data_format)
+    pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
 
+    l_type = "pool2d"
+    x = unsqueeze(x, [2])
     if in_dygraph_mode():
-        output = core.ops.pool3d(
-            x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride,
-            'paddings', padding, 'global_pooling', False, 'padding_algorithm',
-            padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
-            'use_mkldnn', False, 'exclusive', not count_include_pad,
-            'data_format', data_format)
-        if divisor_override is None:
-            return output
-        else:
-            check_instance(divisor_override, "divisor_override")
-            return output * (kernel_size[0] * kernel_size[1] *
-                             kernel_size[2]) / divisor_override
+        pool_out = core.ops.pool2d(x, 'pooling_type', pool_type, 'ksize',
+                                   pool_size, 'adaptive', True)
+        return squeeze(pool_out, [2])
 
-    op_type = "pool3d"
-    helper = LayerHelper(op_type, **locals())
+    helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
-    outputs = {"Out": pool_out}
 
+    outputs = {"Out": pool_out}
     helper.append_op(
-        type=op_type,
+        type=l_type,
         inputs={"X": x},
         outputs=outputs,
         attrs={
-            "pooling_type": 'avg',
-            "ksize": kernel_size,
-            "global_pooling": False,
-            "strides": stride,
-            "paddings": padding,
-            "padding_algorithm": padding_algorithm,
-            "use_cudnn": True,
-            "ceil_mode": ceil_mode,
-            "use_mkldnn": False,
-            "exclusive": not count_include_pad,
-            "data_format": data_format,
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "adaptive": True,
         })
 
-    if divisor_override is None:
-        return pool_out
-    else:
-        check_instance(divisor_override, "divisor_override")
-        return pool_out * (kernel_size[0] * kernel_size[1] *
-                           kernel_size[2]) / divisor_override
+    return squeeze(pool_out, [2])
 
 
 def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
     """
-
-    This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions
-    of the output tensor are determined by the parameter output_size.
-    See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool2d` .
-
-    For avg adaptive pool2d:
-
-    ..  math::
-
-       hstart &= floor(i * H_{in} / H_{out})
-
-       hend &= ceil((i + 1) * H_{in} / H_{out})
-
-       wstart &= floor(j * W_{in} / W_{out})
-
-       wend &= ceil((j + 1) * W_{in} / W_{out})
-
-       Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+    This API implements adaptive average pooling 2d operation.
+    See more details in :ref:`api_nn_pooling_AdaptiveAvgPool2d` .
 
     Args:
         x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor.
@@ -1248,16 +927,12 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
-
     Returns:
         Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor.
-
     Raises:
         ValueError: If `data_format` is not "NCHW" or "NHWC".
-
     Examples:
         .. code-block:: python
-
             # adaptive avg pool2d
             # suppose input data in shape of [N, C, H, W], `output_size` is [m, n],
             # output shape is [N, C, m, n], adaptive pool divide H and W dimensions
@@ -1279,10 +954,10 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
             input_data = np.random.rand(2, 3, 32, 32)
             x = paddle.to_tensor(input_data)
             # x.shape is [2, 3, 32, 32]
-            pool_out = paddle.nn.functional.adaptive_avg_pool2d(
+            out = paddle.nn.functional.adaptive_avg_pool2d(
                             x = x,
                             output_size=[3, 3])
-            # pool_out.shape is [2, 3, 3, 3]
+            # out.shape is [2, 3, 3, 3]
     """
     if not in_dygraph_mode():
         check_variable_and_dtype(x, 'x', ['float32', 'float64'],
@@ -1337,28 +1012,8 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
 
 def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
     """
-
-    This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
-    of the output tensor are determined by the parameter output_size.
-    See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool3d` .
-
-    For avg adaptive pool3d:
-
-    ..  math::
-
-      dstart &= floor(i * D_{in} / D_{out})
-
-      dend &= ceil((i + 1) * D_{in} / D_{out})
-
-      hstart &= floor(j * H_{in} / H_{out})
-
-      hend &= ceil((j + 1) * H_{in} / H_{out})
-
-      wstart &= floor(k * W_{in} / W_{out})
-
-      wend &= ceil((k + 1) * W_{in} / W_{out})
-
-      Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+    This API implements adaptive average pooling 3d operation.
+    See more details in :ref:`api_nn_pooling_AdaptiveAvgPool3d` .
 
     Args:
         x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor.
@@ -1372,16 +1027,12 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
-
     Returns:
         Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor.
-
     Raises:
         ValueError: If `data_format` is not "NCDHW" or "NDHWC".
-
     Examples:
         .. code-block:: python
-
             # adaptive avg pool3d
             # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n],
             # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
@@ -1406,10 +1057,10 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
             input_data = np.random.rand(2, 3, 8, 32, 32)
             x = paddle.to_tensor(input_data)
             # x.shape is [2, 3, 8, 32, 32]
-            pool_out = paddle.nn.functional.adaptive_avg_pool3d(
+            out = paddle.nn.functional.adaptive_avg_pool3d(
                             x = x,
                             output_size=[3, 3, 3])
-            # pool_out.shape is [2, 3, 3, 3, 3]
+            # out.shape is [2, 3, 3, 3, 3]
     """
     if not in_dygraph_mode():
         check_variable_and_dtype(x, 'x', ['float32', 'float64'],
@@ -1461,3 +1112,257 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
         })
 
     return pool_out
+
+
+def adaptive_max_pool1d(x, output_size, return_indices=False, name=None):
+    """
+    This API implements adaptive max pooling 1d operation.
+    See more details in :ref:`api_nn_pooling_AdaptiveMaxPool1d` .
+
+    Args:
+        x (Tensor): The input tensor of pooling operator, which is a 3-D tensor
+                              with shape [N, C, L].  The format of input tensor is NCL,
+                              where N is batch size, C is the number of channels, L is the
+                              length of the feature. The data type is float32 or float64.
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+                it must contain one int.
+        return_indices (bool): If true, the index of max pooling point will be returned along
+                with outputs. It cannot be set in average pooling type. Default False.
+        name(str, optional): For detailed information, please refer
+                                 to :ref:`api_guide_Name`. Usually name is no need to set and
+                                 None by default.
+    Returns:
+            Tensor: The output tensor of adaptive pooling result. The data type is same
+                      as input tensor.
+    Raises:
+            ValueError: 'output_size' should be an integer or list or tuple with length as 1.
+    Examples:
+        .. code-block:: python
+              # max adaptive pool1d
+              # suppose input data in shape of [N, C, L], `output_size` is m or [m],
+              # output shape is [N, C, m], adaptive pool divide L dimension
+              # of input data into m grids averagely and performs poolings in each
+              # grid to get output.
+              # adaptive max pool performs calculations as follow:
+              #
+              #     for i in range(m):
+              #         lstart = floor(i * L / m)
+              #         lend = ceil((i + 1) * L / m)
+              #         output[:, :, i] = max(input[:, :, lstart: lend])
+              #
+              import paddle
+              import paddle.nn.functional as F
+              paddle.disable_static()
+              data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+              pool_out = F.adaptive_max_pool1d(data, output_size=16)
+              # pool_out shape: [1, 3, 16]
+              pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_indices=True)
+              # pool_out shape: [1, 3, 16] indices  shape: [1, 3, 16]
+    """
+    pool_type = 'max'
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'],
+                             'adaptive_max_pool1d')
+    _check_input(x, 3)
+    check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d')
+    check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d')
+
+    pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
+
+    l_type = 'max_pool2d_with_index'
+
+    x = unsqueeze(x, [2])
+    if in_dygraph_mode():
+        pool_out = core.ops.max_pool2d_with_index(
+            x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True)
+        return (squeeze(pool_out[0], [2]), squeeze(
+            pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    mask = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out, "Mask": mask}
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "adaptive": True,
+        })
+
+    return (squeeze(pool_out, [2]),
+            squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
+
+
+def adaptive_max_pool2d(x, output_size, return_indices=False, name=None):
+    """
+        This operation applies a 2D adaptive max pooling on input tensor.
+        See more details in :ref:`api_nn_pooling_AdaptiveMaxPool2d` .
+        Args:
+            x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float32 or float64.
+            output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two elements, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input.
+            return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
+            name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default.
+        Returns:
+            Tensor: The output tensor of adaptive max pool2d result, with the same data type as the input. If `return_indices` is True, a tuple (output, indices) is returned instead.
+        Examples:
+            .. code-block:: python
+              # max adaptive pool2d
+              # suppose input data in the shape of [N, C, H, W], `output_size` is [m, n]
+              # output shape is [N, C, m, n], adaptive pool divide H and W dimensions
+              # of input data into m*n grids averagely and performs poolings in each
+              # grid to get output.
+              # adaptive max pool performs calculations as follows:
+              #
+              #     for i in range(m):
+              #         for j in range(n):
+              #             hstart = floor(i * H / m)
+              #             hend = ceil((i + 1) * H / m)
+              #             wstart = floor(j * W / n)
+              #             wend = ceil((j + 1) * W / n)
+              #             output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend])
+              #
+              import paddle
+              import numpy as np
+              paddle.disable_static()
+              input_data = np.random.rand(2, 3, 32, 32)
+              x = paddle.to_tensor(input_data)
+              # x.shape is [2, 3, 32, 32]
+              out = paddle.nn.functional.adaptive_max_pool2d(
+                            x = x,
+                            output_size=[3, 3])
+              # out.shape is [2, 3, 3, 3]
+    """
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
+                                 'adaptive_max_pool2d')
+    _check_input(x, 4)
+    check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool2d')
+
+    in_h, in_w = x.shape[2:4]
+    if isinstance(output_size, int):
+        output_size = utils.convert_to_list(output_size, 2, 'output_size')
+    else:
+        # Work on a copy: accepts tuples and never mutates the caller's sequence.
+        output_size = list(output_size)
+        if output_size[0] is None:
+            output_size[0] = in_h
+        if output_size[1] is None:
+            output_size[1] = in_w
+
+    if in_dygraph_mode():
+        pool_out = core.ops.max_pool2d_with_index(
+            x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True)
+        return pool_out if return_indices else pool_out[0]
+
+    l_type = 'max_pool2d_with_index'
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    mask = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out, "Mask": mask}
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'max',
+            "ksize": output_size,
+            "adaptive": True,
+        })
+    # Static-graph path: honor return_indices exactly like the dygraph path above.
+    return (pool_out, mask) if return_indices else pool_out
+
+
+def adaptive_max_pool3d(x, output_size, return_indices=False, name=None):
+    """
+        This operation applies a 3D adaptive max pooling on input tensor.
+        See more details in :ref:`api_nn_pooling_AdaptiveMaxPool3d` .
+        Args:
+            x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64.
+            output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input.
+            return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
+            name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default.
+        Returns:
+            Tensor: The output tensor of adaptive max pool3d result. The data type is same as input tensor.
+        Examples:
+            .. code-block:: python
+              # adaptive max pool3d
+              # suppose input data in the shape of [N, C, D, H, W], `output_size` is [l, m, n]
+              # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
+              # of input data into l*m*n grids averagely and performs poolings in each
+              # grid to get output.
+              # adaptive max pool performs calculations as follow:
+              #
+              #     for i in range(l):
+              #         for j in range(m):
+              #             for k in range(n):
+              #                 dstart = floor(i * D / l)
+              #                 dend = ceil((i + 1) * D / l)
+              #                 hstart = floor(j * H / m)
+              #                 hend = ceil((j + 1) * H / m)
+              #                 wstart = floor(k * W / n)
+              #                 wend = ceil((k + 1) * W / n)
+              #                 output[:, :, i, j, k] = max(input[:, :, dstart: dend, hstart: hend, wstart: wend])
+              #
+              import paddle
+              import numpy as np
+              paddle.disable_static()
+              input_data = np.random.rand(2, 3, 8, 32, 32)
+              x = paddle.to_tensor(input_data)
+              # x.shape is [2, 3, 8, 32, 32]
+              out = paddle.nn.functional.adaptive_max_pool3d(
+                            x = x,
+                            output_size=[3, 3, 3])
+              # out.shape is [2, 3, 3, 3, 3]
+    """
+
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
+                                 'adaptive_max_pool3d')
+    _check_input(x, 5)
+    #check_type(output_size, 'pool_size', (int), 'adaptive_max_pool3d')
+    check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool3d')
+
+    in_l, in_h, in_w = x.shape[2:5]
+    if isinstance(output_size, int):
+        output_size = utils.convert_to_list(output_size, 3, 'output_size')
+    else:
+        if output_size[0] == None:
+            output_size[0] = in_l
+        if output_size[1] == None:
+            output_size[1] = in_h
+        if output_size[2] == None:
+            output_size[2] = in_w
+
+    if in_dygraph_mode():
+        pool_out = core.ops.max_pool3d_with_index(
+            x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True)
+        return pool_out if return_indices else pool_out[0]
+
+    l_type = 'max_pool3d_with_index'
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    mask = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out, "Mask": mask}
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'max',
+            "ksize": output_size,
+            "adaptive": True,
+        })
+
+    return (pool_out, mask) if return_indices else pool_out
diff --git a/python/paddle/nn/functional/rnn.py b/python/paddle/nn/functional/rnn.py
index 520cf44360dc370052375c2c9ef3d0b00fbc05de..b7a97bc5aa303ca507cac37798a3625d498050e3 100644
--- a/python/paddle/nn/functional/rnn.py
+++ b/python/paddle/nn/functional/rnn.py
@@ -12,10 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define function of recurrent neural network  
+from paddle.fluid.layers.rnn import rnn, birnn
 
-__all__ = [
-    #       'gru_unit',
-    #       'lstm',
-    #       'lstm_unit'
-]
+__all__ = ['rnn', 'birnn']
diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py
index b25350be601dd9e56d8268859b52a12d3745c44d..6eac15cd694e51c24f94f7686b6e63fa7c6cbf09 100644
--- a/python/paddle/nn/layer/__init__.py
+++ b/python/paddle/nn/layer/__init__.py
@@ -20,6 +20,7 @@ from . import conv
 from . import extension
 from . import activation
 from . import norm
+from . import rnn
 from . import vision
 from . import distance
 from . import transformer
@@ -30,6 +31,7 @@ from .conv import *
 from .extension import *
 from .activation import *
 from .norm import *
+from .rnn import *
 from .vision import *
 
 from .transformer import *
@@ -41,6 +43,7 @@ from .activation import Sigmoid  #DEFINE_ALIAS
 from .activation import LogSoftmax  #DEFINE_ALIAS
 from .activation import HSigmoid  #DEFINE_ALIAS
 from .common import BilinearTensorProduct  #DEFINE_ALIAS
+from .common import Bilinear  #DEFINE_ALIAS
 from .common import Pool2D  #DEFINE_ALIAS
 from .common import Pad2D  #DEFINE_ALIAS
 from .common import ReflectionPad1d  #DEFINE_ALIAS
@@ -57,20 +60,24 @@ from .common import Embedding  #DEFINE_ALIAS
 from .common import Linear  #DEFINE_ALIAS
 from .common import Flatten  #DEFINE_ALIAS
 from .common import UpSample  #DEFINE_ALIAS
+from .common import UpsamplingNearest2d  #DEFINE_ALIAS
+from .common import UpsamplingBilinear2d  #DEFINE_ALIAS
 from .common import Dropout  #DEFINE_ALIAS
 from .common import Dropout2D  #DEFINE_ALIAS
 from .common import Dropout3D  #DEFINE_ALIAS
 from .common import AlphaDropout  #DEFINE_ALIAS
-from .pooling import AdaptiveAvgPool2d  #DEFINE_ALIAS
-from .pooling import AdaptiveAvgPool3d  #DEFINE_ALIAS
 from .pooling import AvgPool1d  #DEFINE_ALIAS
-from .pooling import MaxPool1d  #DEFINE_ALIAS
-from .pooling import AdaptiveAvgPool1d  #DEFINE_ALIAS
-from .pooling import AdaptiveMaxPool1d  #DEFINE_ALIAS
 from .pooling import AvgPool2d  #DEFINE_ALIAS
-from .pooling import MaxPool2d  #DEFINE_ALIAS
 from .pooling import AvgPool3d  #DEFINE_ALIAS
+from .pooling import MaxPool1d  #DEFINE_ALIAS
+from .pooling import MaxPool2d  #DEFINE_ALIAS
 from .pooling import MaxPool3d  #DEFINE_ALIAS
+from .pooling import AdaptiveAvgPool1d  #DEFINE_ALIAS
+from .pooling import AdaptiveAvgPool2d  #DEFINE_ALIAS
+from .pooling import AdaptiveAvgPool3d  #DEFINE_ALIAS
+from .pooling import AdaptiveMaxPool1d  #DEFINE_ALIAS
+from .pooling import AdaptiveMaxPool2d  #DEFINE_ALIAS
+from .pooling import AdaptiveMaxPool3d  #DEFINE_ALIAS
 from .conv import Conv1d  #DEFINE_ALIAS
 from .conv import Conv2d  #DEFINE_ALIAS
 from .conv import Conv3d  #DEFINE_ALIAS
diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py
index 6ce732d95addba1af10ae38506ba0969975ae95d..c38d6018a2500111280a482aa60d072e65e27742 100644
--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
@@ -144,13 +144,13 @@ class Hardshrink(layers.Layer):
     .. math::
 
         hardshrink(x)=
-            \left\{
-            \begin{aligned}
-            &x, & & if \ x > threshold \\
-            &x, & & if \ x < -threshold \\
-            &0, & & if \ others
-            \end{aligned}
-            \right.
+            \\left\\{
+            \\begin{aligned}
+            &x, & & if \\ x > threshold \\\\
+            &x, & & if \\ x < -threshold \\\\
+            &0, & & if \\ others
+            \\end{aligned}
+            \\right.
 
     Parameters:
         threshold (float, optional): The value of threshold for hardthrink. Default is 0.5
@@ -165,14 +165,14 @@ class Hardshrink(layers.Layer):
 
         .. code-block:: python
 
-        import paddle
-        import numpy as np
+            import paddle
+            import numpy as np
 
-        paddle.disable_static()
+            paddle.disable_static()
 
-        x = paddle.to_tensor(np.array([-1, 0.3, 2.5]))
-        m = paddle.nn.Hardshrink()
-        out = m(x) # [-1., 0., 2.5]
+            x = paddle.to_tensor(np.array([-1, 0.3, 2.5]))
+            m = paddle.nn.Hardshrink()
+            out = m(x) # [-1., 0., 2.5]
     """
 
     def __init__(self, threshold=0.5, name=None):
@@ -559,8 +559,8 @@ class SELU(layers.Layer):
                  \\end{cases}
 
     Parameters:
-        scale (float, optional): The value of scale for SELU. Default is 1.0507009873554804934193349852946
-        alpha (float, optional): The value of alpha for SELU. Default is 1.6732632423543772848170429916717
+        scale (float, optional): The value of scale(must be greater than 1.0) for SELU. Default is 1.0507009873554804934193349852946
+        alpha (float, optional): The value of alpha(must be no less than zero) for SELU. Default is 1.6732632423543772848170429916717
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
 
@@ -598,15 +598,15 @@ class LeakyReLU(layers.Layer):
     """
     Leaky ReLU Activation.
 
-    .. math:
+    .. math::
 
         LeakyReLU(x)=
-            \left\{
-            \begin{aligned}
-            &x, & & if \ x >= 0 \\
-            &negative\_slope * x, & & otherwise \\
-            \end{aligned}
-            \right. \\
+            \\left\\{
+            \\begin{aligned}
+            &x, & & if \\ x >= 0 \\\\
+            &negative\_slope * x, & & otherwise \\\\
+            \\end{aligned}
+            \\right. \\\\
 
     Parameters:
         negative_slope (float, optional): Slope of the activation function at
@@ -1015,7 +1015,7 @@ class LogSoftmax(layers.Layer):
     .. math::
 
         Out[i, j] = log(softmax(x)) 
-                  = log(\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])})
+                  = log(\\frac{\\exp(X[i, j])}{\\sum_j(\\exp(X[i, j]))})
 
     Parameters:
         axis (int, optional): The axis along which to perform log_softmax
@@ -1032,26 +1032,26 @@ class LogSoftmax(layers.Layer):
     Examples:
         .. code-block:: python
 
-        import paddle
-        import numpy as np
-
-        paddle.disable_static()
-
-        x = np.array([[[-2.0, 3.0, -4.0, 5.0],
-                        [3.0, -4.0, 5.0, -6.0],
-                        [-7.0, -8.0, 8.0, 9.0]],
-                        [[1.0, -2.0, -3.0, 4.0],
-                        [-5.0, 6.0, 7.0, -8.0],
-                        [6.0, 7.0, 8.0, 9.0]]])
-        m = paddle.nn.LogSoftmax()
-        x = paddle.to_tensor(x)
-        out = m(x)
-        # [[[ -7.1278396   -2.1278396   -9.127839    -0.12783948]
-        #   [ -2.1270514   -9.127051    -0.12705144 -11.127051  ]
-        #   [-16.313261   -17.313261    -1.3132617   -0.31326184]]
-        #  [[ -3.0518122   -6.051812    -7.051812    -0.051812  ]
-        #   [-12.313267    -1.3132664   -0.3132665  -15.313267  ]
-        #   [ -3.4401896   -2.4401896   -1.4401896   -0.44018966]]]
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = np.array([[[-2.0, 3.0, -4.0, 5.0],
+                           [3.0, -4.0, 5.0, -6.0],
+                           [-7.0, -8.0, 8.0, 9.0]],
+                          [[1.0, -2.0, -3.0, 4.0],
+                           [-5.0, 6.0, 7.0, -8.0],
+                           [6.0, 7.0, 8.0, 9.0]]])
+            m = paddle.nn.LogSoftmax()
+            x = paddle.to_tensor(x)
+            out = m(x)
+            # [[[ -7.1278396   -2.1278396   -9.127839    -0.12783948]
+            #   [ -2.1270514   -9.127051    -0.12705144 -11.127051  ]
+            #   [-16.313261   -17.313261    -1.3132617   -0.31326184]]
+            #  [[ -3.0518122   -6.051812    -7.051812    -0.051812  ]
+            #   [-12.313267    -1.3132664   -0.3132665  -15.313267  ]
+            #   [ -3.4401896   -2.4401896   -1.4401896   -0.44018966]]]
     """
 
     def __init__(self, axis=-1, name=None):
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 8a73cfb8ccda15505d8668eff6776aac387c134f..a1e6508c67d96e9f6cc077efe6e61d708674b057 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -16,7 +16,6 @@
 from ...fluid.dygraph import BilinearTensorProduct  #DEFINE_ALIAS
 from ...fluid.dygraph import Pool2D  #DEFINE_ALIAS
 from ...fluid.dygraph import Embedding  #DEFINE_ALIAS
-from ...fluid.dygraph import Linear  #DEFINE_ALIAS
 from ...fluid.dygraph import Flatten  #DEFINE_ALIAS
 from ...fluid.dygraph import layers
 from .. import functional as F
@@ -29,6 +28,8 @@ __all__ = [
     'Linear',
     'UpSample',
     'Pad2D',
+    'UpsamplingNearest2d',
+    'UpsamplingBilinear2d',
     'ReflectionPad1d',
     'ReplicationPad1d',
     'ConstantPad1d',
@@ -47,6 +48,89 @@ __all__ = [
 ]
 
 
+class Linear(layers.Layer):
+    """
+    
+    Fully-connected linear transformation layer:
+
+    .. math::
+
+        Out = {XW + b}
+
+    where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively.
+
+    Linear layer takes only one ``Tensor`` input.
+    The Linear layer multiplies input tensor with weight matrix and
+    produces an output Tensor of shape [N, *, `out_features`],
+    where N is batch size and `*` means any number of additional dimensions.
+    If ``bias_attr`` is not False, a bias variable will be created and added to the output.
+
+    Parameters:
+        in_features(int): The number of input units in this layer.
+        out_features(int): The number of output units in this layer.
+        weight_attr(ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable
+            weights(Parameter) of this layer. Default: None.
+        bias_attr(ParamAttr or list of ParamAttr, optional): The attribute for the bias
+            of this layer. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized zero. Default: None.
+        name(str|None): For detailed information, please refer to :ref:`api_guide_Name`. Default: None.
+
+    Attributes:
+        **weight** (Parameter): the learnable weights of this layer.
+
+        **bias** (Parameter or None): the learnable bias of this layer.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          from paddle import nn
+          import numpy as np
+
+          data = np.ones((3,1,2), np.float32)
+          place = paddle.CPUPlace()
+          paddle.disable_static(place)
+          data = paddle.to_tensor(data)
+          weight_attr=paddle.framework.ParamAttr(name="linear_weight", learning_rate=1.0,
+          trainable=False, regularizer=None, initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0))
+          bias_attr=paddle.framework.ParamAttr(name="linear_bias", learning_rate=1.0,
+          trainable=False, regularizer=None, initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0))
+          linear = nn.Linear(2,2,weight_attr=weight_attr, bias_attr=bias_attr)
+          res = linear(data)  # [3 3 3 3 3 3]
+    """
+
+    def __init__(self,
+                 in_features,
+                 out_features,
+                 weight_attr=None,
+                 bias_attr=None,
+                 name=None):
+        super(Linear, self).__init__()
+        self._dtype = self._helper.get_default_dtype()
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self.name = name
+        self.weight = self.create_parameter(
+            shape=[in_features, out_features],
+            attr=self._weight_attr,
+            dtype=self._dtype,
+            is_bias=False)
+        self.bias = self.create_parameter(
+            shape=[out_features],
+            attr=self._bias_attr,
+            dtype=self._dtype,
+            is_bias=True)
+        self.name = name
+
+    def forward(self, input):
+        out = F.linear(
+            x=input, weight=self.weight, bias=self.bias, name=self.name)
+        return out
+
+
 class UpSample(layers.Layer):
     """
     This op resizes a batch of images.
@@ -54,8 +138,7 @@ class UpSample(layers.Layer):
     or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape
     (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels),
     and the resizing only applies on the three dimensions(depth, height and width).
-    **Warning:** the parameter :attr:`actual_shape` will be deprecated in the
-    future and only use :attr:`out_shape` instead.
+
     Supporting resample methods:
         'linear' : Linear interpolation
         'bilinear' : Bilinear interpolation
@@ -85,7 +168,7 @@ class UpSample(layers.Layer):
     interpolating functions of three variables (e.g. D-direction,
     H-direction and W-direction in this op) on a rectilinear 3D grid.
     The linear interpolation is performed on three directions.
-    Align_corners and align_mode are optional parameters,the calculation method
+    align_corners and align_mode are optional parameters,the calculation method
     of interpolation can be selected by them.
 
     Example:
@@ -183,16 +266,16 @@ class UpSample(layers.Layer):
     https://en.wikipedia.org/wiki/Trilinear_interpolation.
     
     Parameters:
-        input (Variable): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
+        x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
                           its data format is specified by :attr:`data_format`.
-        size (list|tuple|Variable|None): Output shape of image resize
+        size (list|tuple|Tensor|None): Output shape of image resize
              layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) 
              when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
              Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1].
              If a Tensor Variable, its dimensions size should be a 1.
-        scale_factor (float|Variable|None): The multiplier for the input height or width. At
+        scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At
              least one of :attr:`out_shape` or :attr:`scale_factor` must be set.
-             And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.
+             And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. Has to match input size if it is a list.
              Default: None.
         mode (str): The resample method. It supports 'linear', 'nearst', 'bilinear',
                        'bicubic' and 'trilinear' currently. Default: 'nearest'
@@ -216,7 +299,7 @@ class UpSample(layers.Layer):
         A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
         or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels).
     Raises:
-        TypeError: size should be a list or tuple or Variable.
+        TypeError: size should be a list or tuple or Tensor.
         ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear',
                     'trilinear', 'bicubic', or 'nearest' currently.
         ValueError: 'linear' only support 3-D tensor.
@@ -234,16 +317,18 @@ class UpSample(layers.Layer):
     Examples:
         .. code-block:: python
             import paddle
+            import paddle.nn as nn
             import numpy as np
-            import paddle.fluid.dygraph as dg
-            upsample_op = paddle.nn.UpSample(size=[12,12])
+            paddle.disable_static()
+
             input_data = np.random.rand(2,3,6,10).astype("float32")
-            place = paddle.fluid.CPUPlace()
-            with dg.guard(place) as g:
-                input = dg.to_variable(input_data)
-                output = upsample_op(input=input)
-                print(output.shape)
-                # [2L, 3L, 12L, 12L]
+            upsample_out  = paddle.nn.UpSample(size=[12,12])
+
+            input = paddle.to_tensor(input_data)
+            output = upsample_out(x=input)
+            print(output.shape)
+            # [2L, 3L, 12L, 12L]
+
     """
 
     def __init__(self,
@@ -251,8 +336,9 @@ class UpSample(layers.Layer):
                  scale_factor=None,
                  mode='nearest',
                  align_corners=False,
-                 align_mode=1,
-                 data_format='NCHW'):
+                 align_mode=0,
+                 data_format='NCHW',
+                 name=None):
         super(UpSample, self).__init__()
         self.size = size
         self.scale_factor = scale_factor
@@ -260,16 +346,184 @@ class UpSample(layers.Layer):
         self.align_corners = align_corners
         self.align_mode = align_mode
         self.data_format = data_format
+        self.name = name
 
-    def forward(self, input):
+    def forward(self, x):
         out = F.interpolate(
-            input,
+            x,
             size=self.size,
             scale_factor=self.scale_factor,
             mode=self.mode,
             align_corners=self.align_corners,
             align_mode=self.align_mode,
-            data_format=self.data_format)
+            data_format=self.data_format,
+            name=self.name)
+
+        return out
+
+
+class UpsamplingNearest2d(layers.Layer):
+    """
+    This op upsamples a batch of images, using nearest neighbours' pixel values.
+    The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), 
+    and the upsampling only applies on the two dimensions(height and width).
+
+    Nearest neighbor interpolation is to perform nearest neighbor interpolation
+    in both the 3rd dimension(in height direction) and the 4th dimension(in width
+    direction) on input tensor.
+    
+    For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
+
+    Parameters:
+        x (Tensor): 4-D Tensor, its data type is float32, float64, or uint8,
+                          its data format is specified by :attr:`data_format`.
+        size (list|tuple|Tensor|None): Output shape of image resize
+             layer, the shape is (out_h, out_w) when input is a 4-D Tensor. 
+             Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1].
+             If a Tensor Variable, its dimensions size should be a 1.
+        scale_factor (float|int|list|Tensor|None): The multiplier for the input height or width. At
+             least one of :attr:`size` or :attr:`scale_factor` must be set.
+             And :attr:`size` has a higher priority than :attr:`scale_factor`.
+             Default: None. Has to match input size if it is a list.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
+            The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_height, input_width]`. When it is `"NHWC"`, the data is stored
+            in the order of: `[batch_size, input_height, input_width, input_channels]`.
+        name(str, optional): The default value is None.
+                             Normally there is no need for user to set this property.
+                             For more information, please refer to :ref:`api_guide_Name`
+    Returns:
+        A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
+    Raises:
+        TypeError: size should be a list or tuple or Tensor.
+        ValueError: 'nearest' only support 4-D tensor.
+        ValueError: One of size and scale_factor must not be None.
+        ValueError: size length should be 2 for input 4-D tensor.
+        ValueError: scale_factor should be greater than zero.
+        ValueError: data_format can only be 'NCHW', 'NHWC'.
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_data = np.random.rand(2,3,6,10).astype("float32")
+            upsample_out  = paddle.nn.UpsamplingNearest2d(size=[12,12])
+
+            input = paddle.to_tensor(input_data)
+            output = upsample_out(x=input)
+            print(output.shape)
+            # [2L, 3L, 12L, 12L]
+
+    """
+
+    def __init__(self,
+                 size=None,
+                 scale_factor=None,
+                 data_format='NCHW',
+                 name=None):
+        super(UpsamplingNearest2d, self).__init__()
+        self.size = size
+        self.scale_factor = scale_factor
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        out = F.interpolate(
+            x,
+            size=self.size,
+            scale_factor=self.scale_factor,
+            mode='nearest',
+            align_corners=False,
+            align_mode=0,
+            data_format=self.data_format,
+            name=self.name)
+
+        return out
+
+
+class UpsamplingBilinear2d(layers.Layer):
+    """
+    This op upsamples a batch of images, using bilinear interpolation of pixel values.
+    The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), 
+    and the upsampling only applies on the two dimensions(height and width).
+
+    Bilinear interpolation is an extension of linear interpolation for
+    interpolating functions of two variables (e.g. H-direction and
+    W-direction in this op) on a rectilinear 2D grid. The key idea is
+    to perform linear interpolation first in one direction, and then
+    again in the other direction.
+    
+    For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation.
+
+    Parameters:
+        x (Tensor): 4-D Tensor, its data type is float32, float64, or uint8,
+                          its data format is specified by :attr:`data_format`.
+        size (list|tuple|Tensor|None): Output shape of image resize
+             layer, the shape is (out_h, out_w) when input is a 4-D Tensor. 
+             Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1].
+             If a Tensor Variable, its dimensions size should be a 1.
+        scale_factor (float|int|list|Tensor|None): The multiplier for the input height or width. At
+             least one of :attr:`size` or :attr:`scale_factor` must be set.
+             And :attr:`size` has a higher priority than :attr:`scale_factor`.
+             Default: None. Has to match input size if it is a list.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
+            The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_height, input_width]`. When it is `"NHWC"`, the data is stored
+            in the order of: `[batch_size, input_height, input_width, input_channels]`.
+        name(str, optional): The default value is None.
+                             Normally there is no need for user to set this property.
+                             For more information, please refer to :ref:`api_guide_Name`
+    Returns:
+        A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
+    Raises:
+        TypeError: size should be a list or tuple or Tensor.
+        ValueError: 'bilinear' only support 4-D tensor.
+        ValueError: One of size and scale_factor must not be None.
+        ValueError: size length should be 2 for input 4-D tensor.
+        ValueError: scale_factor should be greater than zero.
+        ValueError: data_format can only be 'NCHW', 'NHWC'.
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_data = np.random.rand(2,3,6,10).astype("float32")
+            upsample_out  = paddle.nn.UpsamplingBilinear2d(size=[12,12])
+
+            input = paddle.to_tensor(input_data)
+            output = upsample_out(x=input)
+            print(output.shape)  # [2L, 3L, 12L, 12L]
+    """
+
+    def __init__(self,
+                 size=None,
+                 scale_factor=None,
+                 data_format='NCHW',
+                 name=None):
+        super(UpsamplingBilinear2d, self).__init__()
+        self.size = size
+        self.scale_factor = scale_factor
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        out = F.interpolate(
+            x,
+            size=self.size,
+            scale_factor=self.scale_factor,
+            mode='bilinear',
+            align_corners=True,
+            align_mode=0,
+            data_format=self.data_format,
+            name=self.name)
 
         return out
 
@@ -360,7 +614,9 @@ class Bilinear(layers.Layer):
     This layer performs bilinear on two inputs.
 
     .. math::
+
       out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,size-1
+
       out = out + b
 
     In this formula:
@@ -389,7 +645,7 @@ class Bilinear(layers.Layer):
         **bias** (Parameter): the learnable bias of this layer.
 
     Returns:
-       Variable: A 2-D Tensor of shape [batch_size, out_features].
+       Tensor: A 2-D Tensor of shape [batch_size, out_features].
 
     Examples:
        .. code-block:: python
diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py
index 7d0e59fb7575c9d15d28e88a462aed4ddba47fb9..4e342c00528a2c0115940bb7f695e1ed5b582382 100644
--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
@@ -99,7 +99,8 @@ class _ConvNd(layers.Layer):
                 raise ValueError("in_channels must be divisible by groups.")
 
             if padding_mode in {'reflect', 'replicate', 'circular'}:
-                _paired_padding = utils.convert_to_list(padding, 2, 'padding')
+                _paired_padding = utils.convert_to_list(padding, dims,
+                                                        'padding')
                 self._reversed_padding_repeated_twice = _reverse_repeat_list(
                     _paired_padding, 2)
 
@@ -318,62 +319,80 @@ class Conv2d(_ConvNd):
     output of the convolution, and the corresponding activation function is
     applied to the final result.
     For each input :math:`X`, the equation is:
-    .. math::
-        Out = \\sigma (W \\ast X + b)
+
+    ..  math::
+
+        Out = \\sigma (W \\ast X + b)
+
     Where:
+
     * :math:`X`: Input value, a ``Tensor`` with NCHW format.
     * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+    
     Parameters:
-        in_channels(int): The number of channels in the input image.
-        out_channels(int): The number of channels produced by convolution.
-        kernel_size (int|list|tuple): The size of convolution kernel.
-        stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: 1.
+        in_channels(int): The number of input channels in the input image.
+        out_channels(int): The number of output channels produced by the convolution.
+        kernel_size(int|list|tuple, optional): The size of the convolving kernel.
+        stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. The default value is 1.
         padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
             1. a string in ['valid', 'same'].
-            2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding`on both sides 
+            2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 
             3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...].
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
             The default value is 0.
-        padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` .
-        dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
-            contain two integers, (dilation_H, dilation_W). Otherwise, the
-            dilation_H = dilation_W = dilation. Default: 1.
-        groups (int, optional): The groups number of the Conv2d Layer. According to grouped
+        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. The default value is 1.
+        groups(int, optional): The groups number of the Conv2d Layer. According to grouped
             convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
-            connected to the second half of the input channels. Default: 1.
-        weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
+            connected to the second half of the input channels. The default value is 1.
+        padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``.
+        weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights
             of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
-            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
-        bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d.
+            will create ParamAttr as param_attr. If it is set to None, the parameter
+            is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
+            :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d.
             If it is set to False, no bias will be added to the output units.
             If it is set to None or one attribute of ParamAttr, conv2d
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        data_format (str, optional): Data format that specifies the layout of input.
+            is not set, the bias is initialized zero. The default value is None.
+        data_format(str, optional): Data format that specifies the layout of input.
             It can be "NCHW" or "NHWC". Default: "NCHW".
+
     Attribute:
+
         **weight** (Parameter): the learnable weights of filter of this layer.
+
         **bias** (Parameter or None): the learnable bias of this layer.
+
     Shape:
+
         - x: :math:`(N, C_{in}, H_{in}, W_{in})`
+
         - output: :math:`(N, C_{out}, H_{out}, W_{out})`
+
         Where
-        .. math::
-           H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel_size[0] - 1) + 1))}{strides[0]} + 1 \\\\
-           W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel_size[1] - 1) + 1))}{strides[1]} + 1
+
+        ..  math::
+
+           H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1
+
+           W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1
+
     Examples:
+
         .. code-block:: python
+
           import numpy as np
           import paddle
           import paddle.nn as nn
@@ -646,35 +665,29 @@ class ConvTranspose2d(_ConvNd):
     The details of convolution transpose layer, please refer to the following explanation and references
     `conv2dtranspose <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_ .
     For each input :math:`X`, the equation is:
-    .. math::
+
+    ..  math::
+
         Out = \sigma (W \\ast X + b)
+
     Where:
+
     * :math:`X`: Input value, a ``Tensor`` with NCHW format.
     * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-    Example:
-        - Input:
-          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
-        - Output:
-          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-        Where
-        .. math::
-           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
-           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
-           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
-           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
+    
     Parameters:
         in_channels(int): The number of channels in the input image.
         out_channels(int): The number of channels produced by the convolution.
         kernel_size(int|list|uple): The kernel size. If kernel_size is a tuple,
             it must contain two integers, (kernel_size_H, kernel_size_W).
             Otherwise, the kernel will be a square.
-        output_padding(int|list|tuple, optional): Additional size added to one side
-            of each dimension in the output shape. Default: 0.
+        stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: 1.
         padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
             1. a string in ['valid', 'same'].
             2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` on both sides 
@@ -682,9 +695,8 @@ class ConvTranspose2d(_ConvNd):
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
             The default value is 0.
-        stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: 1.
+        output_padding(int|list|tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0.
         dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
             contain two integers, (dilation_H, dilation_W). Otherwise, the
             dilation_H = dilation_W = dilation. Default: 1.
@@ -694,29 +706,46 @@ class ConvTranspose2d(_ConvNd):
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
             Default: 1.
-        weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
+        weight_attr(ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
             of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d_transpose.
+        bias_attr(ParamAttr|bool, optional): The attribute for the bias of conv2d_transpose.
             If it is set to False, no bias will be added to the output units.
             If it is set to None or one attribute of ParamAttr, conv2d_transpose
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. Default: None.
-        data_format (str, optional): Data format that specifies the layout of input.
+        data_format(str, optional): Data format that specifies the layout of input.
             It can be "NCHW" or "NHWC". Default: "NCHW".
+
     Attribute:
+
         **weight** (Parameter): the learnable weights of filters of this layer.
+
         **bias** (Parameter or None): the learnable bias of this layer.
+
     Shape:
+
         - x: :math:`(N, C_{in}, H_{in}, W_{in})`
+
         - output: :math:`(N, C_{out}, H_{out}, W_{out})`
+
         Where
-        .. math::
-           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\
-           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\
+
+        ..  math::
+
+           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel\_size[0] - 1) + 1
+
+           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel\_size[1] - 1) + 1
+
+           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] )
+
+           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
+
     Examples:
+
        .. code-block:: python
+
           import numpy as np
           import paddle
           import paddle.nn as nn
@@ -791,66 +820,86 @@ class Conv3d(_ConvNd):
     provided, bias is added to the output of the convolution, and the
     corresponding activation function is applied to the final result.
     For each input :math:`X`, the equation is:
-    .. math::
+
+    ..  math::
+
         Out = \sigma (W \\ast X + b)
+
     In the above equation:
+
     * :math:`X`: Input value, a tensor with NCDHW or NDHWC format.
     * :math:`W`: Filter value, a tensor with MCDHW format.
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
     Parameters:
         in_channels(int): The number of input channels in the input image.
         out_channels(int): The number of output channels produced by the convolution.
-        kernel_size (int|list|tuple, optional): The size of the convolving kernel.
-        stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must
+        kernel_size(int|list|tuple, optional): The size of the convolving kernel.
+        stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
             contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
             stride_D = stride_H = stride_W = stride. The default value is 1.
-        padding (int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
+        padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms.
             1. a string in ['valid', 'same'].
             2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 
             3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...].
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
             The default value is 0.
-        dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
+        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
             contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
             dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
-        groups (int, optional): The groups number of the Conv3d Layer. According to grouped
+        groups(int, optional): The groups number of the Conv3d Layer. According to grouped
             convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
             connected to the second half of the input channels. The default value is 1.
-        padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``.
-        weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
+        padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``.
+        weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights
             of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
             will create ParamAttr as param_attr. If it is set to None, the parameter
             is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
             :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None.
-        bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv3d.
             If it is set to False, no bias will be added to the output units.
             If it is set to None or one attribute of ParamAttr, conv3d
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. The default value is None.
-        data_format (str, optional): Data format that specifies the layout of input.
+        data_format(str, optional): Data format that specifies the layout of input.
             It can be "NCDHW" or "NDHWC". Default: "NCDHW".
+
     Attribute:
+
         **weight** (Parameter): the learnable weights of filters of this layer.
+
         **bias** (Parameter): the learnable bias of this layer.
+
     Shape:
+
         - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+
         - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+
         Where
-        .. math::
-           D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
-           H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
-           W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
+
+        ..  math::
+
+           D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1
+
+           H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1
+
+           W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (kernel\_size[2] - 1) + 1))}{strides[2]} + 1
+
     Raises:
         ValueError: If the shapes of input, filter_size, stride, padding and
                     groups mismatch.
+
     Examples:
+
         .. code-block:: python
+
           import numpy as np
           
           import paddle
@@ -936,17 +985,22 @@ class ConvTranspose3d(_ConvNd):
     the output of the convolution, and the corresponding activation function
     is applied to the final result.
     For each input :math:`X`, the equation is:
-    .. math::
+    
+    ..  math::
+
         Out = \sigma (W \\ast X + b)
+
     In the above equation:
+
     * :math:`X`: Input value, a tensor with NCDHW format.
     * :math:`W`: Filter value, a tensor with MCDHW format.
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-    Example:
+
     **Note**:
+
           The conv_transpose3d can be seen as the backward of the conv3d. For conv3d, 
           when stride > 1, conv3d maps multiple input shape to the same output shape, 
           so for conv_transpose3d, when stride > 1, input shape maps multiple output shape.
@@ -957,6 +1011,7 @@ class ConvTranspose3d(_ConvNd):
           and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must 
           between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, 
           conv_transpose3d can compute the kernel size automatically.
+
     Parameters:
         in_channels(int): The number of channels in the input image.
         out_channels(int): The number of channels produced by the convolution.
@@ -985,11 +1040,11 @@ class ConvTranspose3d(_ConvNd):
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
             The default value is 1.
-        weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
+        weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights
             of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. The default value is None.
-        bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose.
             If it is set to False, no bias will be added to the output units.
             If it is set to None or one attribute of ParamAttr, conv3d_transpose
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
@@ -999,24 +1054,38 @@ class ConvTranspose3d(_ConvNd):
             filter_size, padding, and stride to calculate output_size.
             if output_size and filter_size are specified at the same time, They
             should follow the formula above. Default: None.
-        data_format (str, optional): Data format that specifies the layout of input.
+        data_format(str, optional): Data format that specifies the layout of input.
             It can be "NCDHW" or "NDHWC". Default: "NCDHW".
+
     Attribute:
+
         **weight** (Parameter): the learnable weights of filters of this layer.
+
         **bias** (Parameter): the learnable bias of this layer.
+
     Shape:
+
         - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+
         - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+
         Where
-        .. math::
-           D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\
-           H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\
-           W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel_size[2] - 1) + 1 \\\\
+
+        ..  math::
+
+           D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel\_size[0] - 1) + 1
+           
+           H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel\_size[1] - 1) + 1
+           
+           W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel\_size[2] - 1) + 1
+           
     Raises:
         ValueError: If the shapes of input, filter_size, stride, padding and
                     groups mismatch.
     Examples:
+
        .. code-block:: python
+
           import numpy as np
           import paddle
           import paddle.nn as nn
@@ -1024,7 +1093,7 @@ class ConvTranspose3d(_ConvNd):
           
           paddle.disable_static()
           x_var = paddle.to_tensor(x)
-          conv = nn.Conv3DTranspose(4, 6, (3, 3, 3))
+          conv = nn.ConvTranspose3d(4, 6, (3, 3, 3))
           y_var = conv(x_var)
           y_np = y_var.numpy()
           print(y_np.shape)
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index de10e77eb1c000e66a7a914dc94ce39a6268bb61..a1c7d28a85e762ebb381c5f0075df1c7b00396f7 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -634,9 +634,12 @@ class KLDivLoss(fluid.dygraph.Layer):
             Default is ``'mean'``.
 
     Shape:
-      - input: (N, *) where * means, any number of additional dimensions.
-      - label: (N, *), same shape as input
-      - output: tensor with shape: (1) by default.
+
+        - input (Tensor): (N, *), where * means, any number of additional dimensions.
+
+        - label (Tensor): (N, *), same shape as input.
+
+        - output (Tensor): tensor with shape: [1] by default.
 
 
     Examples:
@@ -646,7 +649,7 @@ class KLDivLoss(fluid.dygraph.Layer):
             import numpy as np
             import paddle.nn as nn
 
-            paddle.enable_imperative()
+            paddle.disable_static()
 
             shape = (5, 20)
             x = np.random.uniform(-10, 10, shape).astype('float32')
@@ -654,26 +657,26 @@ class KLDivLoss(fluid.dygraph.Layer):
 
             # 'batchmean' reduction, loss shape will be [N]
             kldiv_criterion = nn.KLDivLoss(reduction='batchmean')
-            pred_loss = kldiv_criterion(paddle.to_variable(x),
-                                        paddle.to_variable(target))
+            pred_loss = kldiv_criterion(paddle.to_tensor(x),
+                                        paddle.to_tensor(target))
             # shape=[5]
 
             # 'mean' reduction, loss shape will be [1]
             kldiv_criterion = nn.KLDivLoss(reduction='mean')
-            pred_loss = kldiv_criterion(paddle.to_variable(x),
-                                        paddle.to_variable(target))
+            pred_loss = kldiv_criterion(paddle.to_tensor(x),
+                                        paddle.to_tensor(target))
             # shape=[1]
 
             # 'sum' reduction, loss shape will be [1]
             kldiv_criterion = nn.KLDivLoss(reduction='sum')
-            pred_loss = kldiv_criterion(paddle.to_variable(x),
-                                        paddle.to_variable(target))
+            pred_loss = kldiv_criterion(paddle.to_tensor(x),
+                                        paddle.to_tensor(target))
             # shape=[1]
 
             # 'none' reduction, loss shape is same with X shape
             kldiv_criterion = nn.KLDivLoss(reduction='none')
-            pred_loss = kldiv_criterion(paddle.to_variable(x),
-                                        paddle.to_variable(target))
+            pred_loss = kldiv_criterion(paddle.to_tensor(x),
+                                        paddle.to_tensor(target))
             # shape=[5, 20]
     """
 
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index c7855b23bf6e6861326533e3cc93d7f7c5bd4ca2..4d25418579d74ae896f8ca590400a0a334047e93 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -27,6 +27,7 @@
 
 # TODO: define normalization api  
 
+import six
 from ...fluid.dygraph.nn import InstanceNorm
 
 from ...fluid.dygraph import BatchNorm  #DEFINE_ALIAS
@@ -36,7 +37,6 @@ from ...fluid.dygraph import BatchNorm  #DEFINE_ALIAS
 from ...fluid.dygraph import SpectralNorm  #DEFINE_ALIAS
 
 from ...fluid.dygraph import layers
-
 from ...framework import get_default_dtype, set_default_dtype
 from ...fluid.framework import in_dygraph_mode
 
@@ -50,6 +50,7 @@ from ..functional import batch_norm, layer_norm, instance_norm
 import numpy as np
 import numbers
 import warnings
+from ...fluid.dygraph.base import no_grad
 
 __all__ = [
     'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm',
@@ -566,17 +567,28 @@ class _BatchNormBase(layers.Layer):
         param_shape = [num_features]
 
         # create parameter
-        self.weight = self.create_parameter(
-            attr=self._weight_attr,
-            shape=param_shape,
-            default_initializer=Constant(1.0))
-        self.weight.stop_gradient = (self._weight_attr is False) or (
-            self._weight_attr and self._weight_attr.learning_rate == 0.)
+        if weight_attr == False:
+            self.weight = self.create_parameter(
+                attr=None, shape=param_shape, default_initializer=Constant(1.0))
+            self.weight.stop_gradient = True
+        else:
+            self.weight = self.create_parameter(
+                attr=self._weight_attr,
+                shape=param_shape,
+                default_initializer=Constant(1.0))
+            self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
 
-        self.bias = self.create_parameter(
-            attr=self._bias_attr, shape=param_shape, is_bias=True)
-        self.bias.stop_gradient = (self._bias_attr is False) or (
-            self._bias_attr and self._bias_attr.learning_rate == 0.)
+        if bias_attr == False:
+            self.bias = self.create_parameter(
+                attr=None,
+                shape=param_shape,
+                default_initializer=Constant(0.0),
+                is_bias=True)
+            self.bias.stop_gradient = True
+        else:
+            self.bias = self.create_parameter(
+                attr=self._bias_attr, shape=param_shape, is_bias=True)
+            self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0.
 
         moving_mean_name = None
         moving_variance_name = None
@@ -611,6 +623,7 @@ class _BatchNormBase(layers.Layer):
         self._epsilon = epsilon
         self._fuse_with_relu = False
         self._track_running_stats = track_running_stats
+        self._name = name
 
     def _check_input_dim(self, input):
         raise NotImplementedError("BatchNorm Base error")
@@ -898,7 +911,7 @@ class BatchNorm3d(_BatchNormBase):
                 len(input.shape)))
 
 
-class SyncBatchNorm(layers.Layer):
+class SyncBatchNorm(_BatchNormBase):
     """
     This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
     It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can 
@@ -984,72 +997,16 @@ class SyncBatchNorm(layers.Layer):
 
     def __init__(self,
                  num_features,
-                 epsilon=1e-05,
                  momentum=0.9,
-                 track_running_stats=True,
+                 epsilon=1e-05,
                  weight_attr=None,
                  bias_attr=None,
                  data_format='NCHW',
+                 track_running_stats=True,
                  name=None):
-        super(SyncBatchNorm, self).__init__()
-        self._weight_attr = weight_attr
-        self._bias_attr = bias_attr
-        self._num_features = num_features
-        self._data_layout = data_format
-        self._momentum = momentum
-        self._epsilon = epsilon
-        self._track_running_stats = track_running_stats
-
-        if self._track_running_stats == False:
-            warnings.warn(
-                "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version."
-            )
-
-        param_shape = [self._num_features]
-
-        # create parameter
-        if weight_attr == False:
-            self.weight = self.create_parameter(
-                attr=None, shape=param_shape, default_initializer=Constant(1.0))
-            self.weight.stop_gradient = True
-        else:
-            self.weight = self.create_parameter(
-                attr=self._weight_attr,
-                shape=param_shape,
-                default_initializer=Constant(1.0))
-            self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
-
-        if bias_attr == False:
-            self.bias = self.create_parameter(
-                attr=None,
-                shape=param_shape,
-                default_initializer=Constant(0.0),
-                is_bias=True)
-            self.bias.stop_gradient = True
-        else:
-            self.bias = self.create_parameter(
-                attr=self._bias_attr, shape=param_shape, is_bias=True)
-            self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
-
-        self._mean = self.create_parameter(
-            attr=ParamAttr(
-                name=None,
-                initializer=Constant(0.0),
-                trainable=False,
-                do_model_average=True),
-            shape=param_shape,
-            dtype=self._dtype)
-        self._mean.stop_gradient = True
-
-        self._variance = self.create_parameter(
-            attr=ParamAttr(
-                name=None,
-                initializer=Constant(1.0),
-                trainable=False,
-                do_model_average=True),
-            shape=param_shape,
-            dtype=self._dtype)
-        self._variance.stop_gradient = True
+        super(SyncBatchNorm,
+              self).__init__(num_features, momentum, epsilon, weight_attr,
+                             bias_attr, data_format, track_running_stats, name)
 
     def forward(self, x):
         # create output
@@ -1063,7 +1020,7 @@ class SyncBatchNorm(layers.Layer):
         if in_dygraph_mode():
             attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
                      "is_test", not self.training, "data_layout",
-                     self._data_layout, "use_mkldnn", False, "fuse_with_relu",
+                     self._data_format, "use_mkldnn", False, "fuse_with_relu",
                      False, "use_global_stats", False, 'trainable_statistics',
                      False)
             sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm(
@@ -1073,13 +1030,13 @@ class SyncBatchNorm(layers.Layer):
             return sync_batch_norm_out
 
         check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
-                                 'BatchNorm')
+                                 'SyncBatchNorm')
 
         attrs = {
             "momentum": self._momentum,
             "epsilon": self._epsilon,
             "is_test": not self.training,
-            "data_layout": self._data_layout,
+            "data_layout": self._data_format,
             "use_mkldnn": False,
             "fuse_with_relu": False,
             "use_global_stats": False,
@@ -1112,3 +1069,45 @@ class SyncBatchNorm(layers.Layer):
         self._helper.append_op(
             type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
         return sync_batch_norm_out
+
+    @classmethod
+    def convert_sync_batchnorm(cls, layer):
+        """
+        Helper function to convert :class:`paddle.nn.BatchNorm*d` layers in the model to :class:`paddle.nn.SyncBatchNorm` layers.
+
+        Parameters:
+            layer(paddle.nn.Layer): model containing one or more `BatchNorm*d` layers.
+
+        Returns:
+            The model with each `BatchNorm*d` layer replaced by a SyncBatchNorm layer; a layer that is not a `BatchNorm*d` layer is returned as it is.
+
+        Examples:
+
+            .. code-block:: python
+
+                import paddle
+                import paddle.nn as nn
+
+                paddle.disable_static()
+                model = nn.Sequential(nn.Conv2d(3, 5, 3), nn.BatchNorm2d(5))
+                sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
+        """
+        layer_output = layer
+        if isinstance(layer, _BatchNormBase):
+            # Arguments follow SyncBatchNorm.__init__ order: momentum precedes epsilon.
+            layer_output = SyncBatchNorm(
+                layer._num_features, layer._momentum, layer._epsilon,
+                layer._weight_attr, layer._bias_attr, layer._data_format,
+                layer._track_running_stats, layer._name)
+
+            if layer._weight_attr != False and layer._bias_attr != False:
+                with no_grad():
+                    layer_output.weight = layer.weight
+                    layer_output.bias = layer.bias
+            layer_output._mean = layer._mean
+            layer_output._variance = layer._variance
+
+        for name, sublayer in layer.named_sublayers():
+            layer_output.add_sublayer(name,
+                                      cls.convert_sync_batchnorm(sublayer))
+        return layer_output
diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
index 87fa0caec9ee287c42d8308d9da25c6d2fc9b911..6f6b567849732ff889db4507708758cd8eeab2a8 100755
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
@@ -12,198 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle
-
-from ...fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
-from ...fluid.layers import utils
 from ...fluid.dygraph import layers
 from ...fluid.layer_helper import LayerHelper
 from .. import functional as F
 
 __all__ = [
-    'AdaptiveAvgPool2d',
-    'AdaptiveAvgPool3d',
     'AvgPool1d',
-    'maxPool1d',
-    'AdaptiveMaxPool1d',
-    'AdaptiveAvgPool1d',
     'AvgPool2d',
-    'MaxPool2d',
     'AvgPool3d',
+    'MaxPool1d',
+    'MaxPool2d',
     'MaxPool3d',
+    'AdaptiveAvgPool1d',
+    'AdaptiveAvgPool2d',
+    'AdaptiveAvgPool3d',
+    'AdaptiveMaxPool1d',
+    'AdaptiveMaxPool2d',
+    'AdaptiveMaxPool3d',
 ]
 
 
-class AdaptiveAvgPool2d(layers.Layer):
-    """
-
-    This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions
-    of the output tensor are determined by the parameter output_size.
-
-    For avg adaptive pool2d:
-
-    ..  math::
-
-       hstart &= floor(i * H_{in} / H_{out})
-
-       hend &= ceil((i + 1) * H_{in} / H_{out})
-
-       wstart &= floor(j * W_{in} / W_{out})
-
-       wend &= ceil((j + 1) * W_{in} / W_{out})
-
-       Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
-
-
-    Parameters:
-        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain two element, (H, W). H and W can be either a int, or None which means
-            the size will be the same as that of the input.
-        data_format (str): The data format of the input and output data. An optional string
-            from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
-            the order of: [batch_size, input_channels, input_height, input_width].
-        name(str, optional): For detailed information, please refer
-                             to :ref:`api_guide_Name`. Usually name is no need to set and
-                             None by default.
-
-    Shape:
-        x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type can be float32 or float64.
-        output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type is same as input x.
-
-    Returns:
-        A callable object of AdaptiveAvgPool2d.
-
-    Examples:
-        .. code-block:: python
-
-            # adaptive avg pool2d
-            # suppose input data in shape of [N, C, H, W], `output_size` is [m, n],
-            # output shape is [N, C, m, n], adaptive pool divide H and W dimensions
-            # of input data into m * n grids averagely and performs poolings in each
-            # grid to get output.
-            # adaptive avg pool performs calculations as follow:
-            #
-            #     for i in range(m):
-            #         for j in range(n):
-            #             hstart = floor(i * H / m)
-            #             hend = ceil((i + 1) * H / m)
-            #             wstart = floor(i * W / n)
-            #             wend = ceil((i + 1) * W / n)
-            #             output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
-            #
-            import paddle
-            import numpy as np
-            paddle.disable_static()
-            input_data = np.random.rand(2, 3, 32, 32)
-            x = paddle.to_tensor(input_data)
-            # x.shape is [2, 3, 32, 32]
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=3)
-            pool_out = adaptive_avg_pool(x = x)
-            # pool_out.shape is [2, 3, 3, 3]
-    """
-
-    def __init__(self, output_size, data_format="NCHW", name=None):
-        super(AdaptiveAvgPool2d, self).__init__()
-        self._output_size = output_size
-        self._data_format = data_format
-        self._name = name
-
-    def forward(self, x):
-        return F.adaptive_avg_pool2d(
-            x,
-            output_size=self._output_size,
-            data_format=self._data_format,
-            name=self._name)
-
-
-class AdaptiveAvgPool3d(layers.Layer):
-    """
-
-    This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
-    of the output tensor are determined by the parameter output_size.
-
-    For avg adaptive pool3d:
-
-    ..  math::
-
-      dstart &= floor(i * D_{in} / D_{out})
-
-      dend &= ceil((i + 1) * D_{in} / D_{out})
-
-      hstart &= floor(j * H_{in} / H_{out})
-
-      hend &= ceil((j + 1) * H_{in} / H_{out})
-
-      wstart &= floor(k * W_{in} / W_{out})
-
-      wend &= ceil((k + 1) * W_{in} / W_{out})
-
-      Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
-
-
-    Parameters:
-        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means
-            the size will be the same as that of the input.
-        data_format (str): The data format of the input and output data. An optional string
-            from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in
-            the order of: [batch_size, input_channels, input_depth, input_height, input_width].
-        name(str, optional): For detailed information, please refer
-                             to :ref:`api_guide_Name`. Usually name is no need to set and
-                             None by default.
-    Shape:
-        x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type can be float32 or float64.
-        output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x.
-
-    Returns:
-        A callable object of AdaptiveAvgPool3d.
-
-    Examples:
-        .. code-block:: python
-
-            # adaptive avg pool3d
-            # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n],
-            # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
-            # of input data into l * m * n grids averagely and performs poolings in each
-            # grid to get output.
-            # adaptive avg pool performs calculations as follow:
-            #
-            #     for i in range(l):
-            #         for j in range(m):
-            #             for k in range(n):
-            #                 dstart = floor(i * D / l)
-            #                 dend = ceil((i + 1) * D / l)
-            #                 hstart = floor(j * H / m)
-            #                 hend = ceil((j + 1) * H / m)
-            #                 wstart = floor(k * W / n)
-            #                 wend = ceil((k + 1) * W / n)
-            #                 output[:, :, i, j, k] =
-            #                     avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
-            import paddle
-            import numpy as np
-            paddle.disable_static()
-            input_data = np.random.rand(2, 3, 8, 32, 32)
-            x = paddle.to_tensor(input_data)
-            # x.shape is [2, 3, 8, 32, 32]
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=3)
-            pool_out = adaptive_avg_pool(x = x)
-            # pool_out = [2, 3, 3, 3, 3]
-    """
-
-    def __init__(self, output_size, data_format="NCDHW", name=None):
-        super(AdaptiveAvgPool3d, self).__init__()
-        self._output_size = output_size
-        self._data_format = data_format
-        self._name = name
-
-    def forward(self, x):
-        return F.adaptive_avg_pool3d(
-            x,
-            output_size=self._output_size,
-            data_format=self._data_format,
-            name=self._name)
-
-
 class AvgPool1d(layers.Layer):
     """
     This operation applies a 1D average pooling over an input signal composed
@@ -223,17 +51,20 @@ class AvgPool1d(layers.Layer):
 
     Args:
         kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain one integers.
+            it must contain an integer.
         stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
-            it must contain one integers.
-        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
-            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
-            it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero,
-            then the input is implicitly zero-padded on both sides for padding number of points.
+            it must contain an integer.
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every side.
+            4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
         count_include_pad (bool): Whether to exclude padding points in average pooling
-                          mode, default is `true`.
+                          mode, default is `True`.
         ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width.
-            If it is set to False, the floor function will be used. Default False
+            If it is set to False, the floor function will be used. The default value is False.
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
@@ -245,10 +76,14 @@ class AvgPool1d(layers.Layer):
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is "VALID", but `ceil_mode` is True.
         ValueError: If `padding` is a list or tuple but its length greater than 1.
-        ShapeError: If the input is not a 3-D.
+        ShapeError: If the input is not a 3-D tensor.
         ShapeError: If the output's shape calculated is not greater than 0.
 
 
+    Shape:
+        - input: 3-D tensor.
+        - output: 3-D tensor.
+
     Examples:
 
         .. code-block:: python
@@ -284,63 +119,74 @@ class AvgPool1d(layers.Layer):
         return out
 
 
-class MaxPool1d(layers.Layer):
+class AvgPool2d(layers.Layer):
     """
-    Applies a 1D max pooling over an input signal composed of several input planes based
-    on the input, output_size, return_indices parameters.
-    Input(X) and output(Out) are in NCL format, where N is batch
-    size, C is the number of channels, L is the length of the feature.
-
-    The output value of the layer with input size (N, C, L),
-    output (N, C, L_{out}) and kernel_size k can be precisely described as
-    For average pool1d:
+    This operation applies 2D average pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
 
-    ..  math::
+    Example:
+      Input:
+           X shape: $(N, C, H_{in}, W_{in})$
+      Attr:
+           kernel_size: ksize
 
-       Output(N_i, C_i, l) &=  max(Input[N_i, C_i, stride \times l:stride \times l+k])}
+      Output:
+           Out shape: $(N, C, H_{out}, W_{out})$
+           $$
+           out(N_i, C_j, h, w)  = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1}
+                               input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
+           $$
 
     Args:
-        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain one integers.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
         stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
-            it must contain one integers.
-        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
-            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
-            it could be the following forms: `[pad_left, pad_right]`.
-        return_indices (bool): Whether return the max indices along with the outputs. default is `False`.
-        ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default.
-            If it is set to False, the floor function will be used. Default False
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int.
+
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_width] whose value means the padding size of each dimension.
+            4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
+        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
+        count_include_pad (bool): Whether to include padding points in average pooling
+                          mode, default is `True`.
+        divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
 
-    Returns:
-        None.
+    Shape:
+        - x: 4-D tensor.
+        - out: 4-D tensor.
 
+    Returns: None.
     Raises:
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is "VALID", but `ceil_mode` is True.
-        ValueError: If `padding` is a list or tuple but its length greater than 1.
-        ShapeError: If the input is not a 3-D.
         ShapeError: If the output's shape calculated is not greater than 0.
-
-
     Examples:
-
         .. code-block:: python
-
           import paddle
           import paddle.nn as nn
+          import numpy as np
           paddle.disable_static()
 
-          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
-          MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
-          pool_out = MaxPool1d(data)
-          # pool_out shape: [1, 3, 16]
-
-          MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0, return_indices=True)
-          pool_out, indices = MaxPool1d(data)
-          # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
+          # avg pool2d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
+          AvgPool2d = nn.AvgPool2d(kernel_size=2,
+                                stride=2, padding=0)
+          output = AvgPool2d(input)
+          # output.shape [1, 3, 16, 16]
 
     """
 
@@ -348,113 +194,155 @@ class MaxPool1d(layers.Layer):
                  kernel_size,
                  stride=None,
                  padding=0,
-                 return_indices=False,
                  ceil_mode=False,
+                 count_include_pad=True,
+                 divisor_override=None,
+                 data_format="NCHW",
                  name=None):
-        super(MaxPool1d, self).__init__()
-        self.kernel_size = kernel_size
+        super(AvgPool2d, self).__init__()
+        self.ksize = kernel_size
         self.stride = stride
         self.padding = padding
         self.ceil_mode = ceil_mode
-        self.return_indices = return_indices
+        self.count_include_pad = count_include_pad
+        self.divisor = divisor_override
+        self.data_format = data_format
         self.name = name
 
-    def forward(self, input):
-        out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding,
-                           self.return_indices, self.ceil_mode, self.name)
-        return out
+    def forward(self, x):
+        return F.avg_pool2d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            ceil_mode=self.ceil_mode,
+            count_include_pad=self.count_include_pad,
+            divisor_override=self.divisor,
+            data_format=self.data_format,
+            name=self.name)
 
 
-class AdaptiveAvgPool1d(layers.Layer):
+class AvgPool3d(layers.Layer):
     """
-
-    This operation applies a 1D adaptive average pooling over an input signal composed
-    of several input planes, based on the input, output_size, return_indices parameters.
-    Input(X) and output(Out) are in NCL format, where N is batch
-    size, C is the number of channels, L is the length of the feature.
-    The output tensor shape will be [N, C, output_size].
-
-    For average adaptive pool1d:
-
-    ..  math::
-
-       lstart &= floor(i * L_{in} / L_{out})
-
-       lend &= ceil((i + 1) * L_{in} / L_{out})
-
-       Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)}
+    This operation applies 3D average pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCDHW format, where N is batch size, C is the number of channels,
+    D is the depth of the feature, H is the height of the feature, and W is the width of the feature.
 
     Args:
-        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain one int.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
+            is a tuple or list, it must contain three integers,
+            (kernel_size_Depth, kernel_size_Height, kernel_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain three integers, (stride_Depth, stride_Height, stride_Width).
+            Otherwise, the pool stride size will be a cube of an int.
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_width] whose value means the padding size of each dimension.
+            4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
+        ceil_mode (bool): When True, will use `ceil` instead of `floor` to compute the output shape. The default value is False.
+        count_include_pad (bool): Whether to include padding points in average pooling
+                          mode, default is True.
+        divisor_override (int|float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
+                        The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_depth, input_height, input_width]`.
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
 
-    Returns:
-        None.
-
+    Returns: None.
     Raises:
-        ValueError: 'pool_size' should be a integer or list or tuple with length as 1.
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+
+    Shape:
+        - x: 5-D tensor.
+        - out: 5-D tensor.
 
     Examples:
         .. code-block:: python
-
-          # average adaptive pool1d
-          # suppose input data in shape of [N, C, L], `output_size` is m or [m],
-          # output shape is [N, C, m], adaptive pool divide L dimension
-          # of input data into m grids averagely and performs poolings in each
-          # grid to get output.
-          # adaptive max pool performs calculations as follow:
-          #
-          #     for i in range(m):
-          #         lstart = floor(i * L / m)
-          #         lend = ceil((i + 1) * L / m)
-          #         output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend)
-          #
           import paddle
           import paddle.nn as nn
+          import numpy as np
           paddle.disable_static()
 
-          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
-          AdaptiveAvgPool1d = nn.AdaptiveAvgPool1d(output_size=16)
-          pool_out = AdaptiveAvgPool1d(data)
-          # pool_out shape: [1, 3, 16]
+          # avg pool3d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32))
+          AvgPool3d = nn.AvgPool3d(kernel_size=2,
+                                   stride=2, padding=0)
+          output = AvgPool3d(input)
+          # output.shape [1, 2, 1, 16, 16]
+
     """
 
-    def __init__(self, output_size, name=None):
-        super(AdaptiveAvgPool1d, self).__init__()
-        self.output_size = output_size
+    def __init__(self,
+                 kernel_size,
+                 stride,
+                 padding=0,
+                 ceil_mode=False,
+                 count_include_pad=True,
+                 divisor_override=None,
+                 data_format="NCDHW",
+                 name=None):
+        super(AvgPool3d, self).__init__()
+        self.ksize = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.count_include_pad = count_include_pad
+        self.divisor = divisor_override
+        self.data_format = data_format
         self.name = name
 
-    def forward(self, input):
-        return F.adaptive_avg_pool1d(input, self.output_size, self.name)
+    def forward(self, x):
+        return F.avg_pool3d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            ceil_mode=self.ceil_mode,
+            count_include_pad=self.count_include_pad,
+            divisor_override=self.divisor,
+            data_format=self.data_format,
+            name=self.name)
 
 
-class AdaptiveMaxPool1d(layers.Layer):
+class MaxPool1d(layers.Layer):
     """
-
-    This operation applies a 1D adaptive max pooling over an input signal composed
-    of several input planes, based on the input, output_size, return_indices parameters.
+    Applies a 1D max pooling over an input signal composed of several input planes based
+    on the input, kernel_size, stride, padding and return_indices parameters.
     Input(X) and output(Out) are in NCL format, where N is batch
     size, C is the number of channels, L is the length of the feature.
-    The output tensor shape will be [N, C, output_size].
 
-    For max adaptive pool1d:
+    The output value of the layer with input size (N, C, L),
+    output (N, C, L_{out}) and kernel_size k can be precisely described as
+    For max pool1d:
 
     ..  math::
 
-       lstart &= floor(i * L_{in} / L_{out})
-
-       lend &= ceil((i + 1) * L_{in} / L_{out})
-
-       Output(i) &= max(Input[lstart:lend])}
+       Output(N_i, C_i, l) &=  max(Input[N_i, C_i, stride \times l:stride \times l+k])
 
     Args:
-        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-             it must contain one int.
-        return_indices (bool): If true, the index of max pooling point will be returned along
-            with outputs. It cannot be set in average pooling type. Default False.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain an integer.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain an integer.
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An integer, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every side.
+            4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
+        return_indices (bool): Whether to return the max indices along with the outputs. Default is `False`.
+        ceil_mode (bool): Whether to use the ceil function to calculate the output length.
+            If it is set to False, the floor function will be used. The default value is False.
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
@@ -462,53 +350,60 @@ class AdaptiveMaxPool1d(layers.Layer):
         None.
 
     Raises:
-        ValueError: 'pool_size' should be a integer or list or tuple with length as 1.
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ValueError: If `padding` is a list or tuple but its length is greater than 1.
+        ShapeError: If the input is not a 3-D tensor.
+        ShapeError: If the output's shape calculated is not greater than 0.
+
+
+    Shape:
+        - x: 3-D tensor.
+        - out: 3-D tensor.
 
     Examples:
+
         .. code-block:: python
 
-          # max adaptive pool1d
-          # suppose input data in shape of [N, C, L], `output_size` is m or [m],
-          # output shape is [N, C, m], adaptive pool divide L dimension
-          # of input data into m grids averagely and performs poolings in each
-          # grid to get output.
-          # adaptive max pool performs calculations as follow:
-          #
-          #     for i in range(m):
-          #         lstart = floor(i * L / m)
-          #         lend = ceil((i + 1) * L / m)
-          #         output[:, :, i] = max(input[:, :, lstart: lend])
-          #
-                    import paddle
+          import paddle
           import paddle.nn as nn
           paddle.disable_static()
 
           data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
-          AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16)
-          pool_out = AdaptiveMaxPool1d(data)
+          MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
+          pool_out = MaxPool1d(data)
           # pool_out shape: [1, 3, 16]
 
-          # for return_indices = true
-          AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16, return_indices=True)
-          pool_out, indices = AdaptiveMaxPool1d(data)
+          MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0, return_indices=True)
+          pool_out, indices = MaxPool1d(data)
           # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
 
     """
 
-    def __init__(self, output_size, return_indices=False, name=None):
-        super(AdaptiveMaxPool1d, self).__init__()
-        self.output_size = output_size
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 return_indices=False,
+                 ceil_mode=False,
+                 name=None):
+        super(MaxPool1d, self).__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.ceil_mode = ceil_mode
         self.return_indices = return_indices
         self.name = name
 
     def forward(self, input):
-        return F.adaptive_max_pool1d(input, self.output_size,
-                                     self.return_indices, self.name)
+        out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding,
+                           self.return_indices, self.ceil_mode, self.name)
+        return out
 
 
-class AvgPool2d(layers.Layer):
+class MaxPool2d(layers.Layer):
     """
-    This operation applies 2D average pooling over input features based on the input,
+    This operation applies 2D max pooling over input features based on the input,
     and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
     in NCHW format, where N is batch size, C is the number of channels,
     H is the height of the feature, and W is the width of the feature.
@@ -522,8 +417,9 @@ class AvgPool2d(layers.Layer):
       Output:
            Out shape: $(N, C, H_{out}, W_{out})$
            $$
-           out(N_i, C_j, h, w)  = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1}
-                               input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
+           out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\
+                                    & \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
+                                                   \text{stride[1]} \times w + n)
            $$
 
     Args:
@@ -532,31 +428,33 @@ class AvgPool2d(layers.Layer):
             Otherwise, the pool kernel size will be a square of an int.
         stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
             it must contain two integers, (pool_stride_Height, pool_stride_Width).
-            Otherwise, the pool stride size will be a square of an int. Default: kernel_size.
-        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
-            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
-            it could be in three forms: `[pad_height, pad_width]` or
-            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
-            `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
-            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
-            Otherwise, the pool padding size will be a square of an int.
+            Otherwise, the pool stride size will be a square of an int.
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_width] whose value means the padding size of each dimension.
+            4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
         ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
-        count_include_pad (bool): Whether to exclude padding points in average pooling
-                          mode, default is `true`.
-        divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
-        name(str, optional): For detailed information, please refer
-                             to :ref:`api_guide_Name`. Usually name is no need to set and
-                             None by default.
+        return_indices (bool): Whether to return the max indices along with the outputs.
         data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
                         The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
                         `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
 
-    Returns: None.
+    Returns: None
     Raises:
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is "VALID", but `ceil_mode` is True.
         ShapeError: If the output's shape calculated is not greater than 0.
+
+    Shape:
+        - x: 4-D tensor.
+        - out: 4-D tensor.
+
     Examples:
         .. code-block:: python
           import paddle
@@ -566,172 +464,72 @@ class AvgPool2d(layers.Layer):
 
           # max pool2d
           input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
-          AvgPool2d = nn.AvgPool2d(kernel_size=2,
-                                stride=2, padding=0)
-          output = AvgPoo2d(input)
+          MaxPool2d = nn.MaxPool2d(kernel_size=2,
+                                   stride=2, padding=0)
+          output = MaxPool2d(input)
           # output.shape [1, 3, 16, 16]
 
+          # for return_indices=True
+          MaxPool2d = nn.MaxPool2d(kernel_size=2,stride=2, padding=0, return_indices=True)
+          output, max_indices = MaxPool2d(input)
+          # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
     """
 
     def __init__(self,
                  kernel_size,
                  stride=None,
                  padding=0,
+                 return_indices=False,
                  ceil_mode=False,
-                 count_include_pad=True,
-                 divisor_override=None,
                  data_format="NCHW",
                  name=None):
-        super(AvgPool2d, self).__init__()
+        super(MaxPool2d, self).__init__()
         self.ksize = kernel_size
         self.stride = stride
         self.padding = padding
+        self.return_indices = return_indices
         self.ceil_mode = ceil_mode
-        self.count_include_pad = count_include_pad
-        self.divisor = divisor_override
         self.data_format = data_format
         self.name = name
 
     def forward(self, x):
-        return F.avg_pool2d(
+        return F.max_pool2d(
             x,
             kernel_size=self.ksize,
             stride=self.stride,
             padding=self.padding,
-            ceil_mode=self.ceil_mode,
-            count_include_pad=self.count_include_pad,
-            divisor_override=self.divisor,
+            return_indices=self.return_indices,
             data_format=self.data_format,
             name=self.name)
 
 
-class MaxPool2d(layers.Layer):
+class MaxPool3d(layers.Layer):
     """
-    This operation applies 2D max pooling over input feature based on the input,
+    This operation applies 3D max pooling over input features based on the input,
     and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
-    in NCHW format, where N is batch size, C is the number of channels,
-    H is the height of the feature, and W is the width of the feature.
-
-    Example:
-      Input:
-           X shape: $(N, C, H_{in}, W_{in})$
-      Attr:
-           kernel_size: ksize
-
-      Output:
-           Out shape: $(N, C, H_{out}, W_{out})$
-           $$
-           out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\
-                                    & \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
-                                                   \text{stride[1]} \times w + n)
-           $$
-
-    Args:
-        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain two integers, (pool_size_Height, pool_size_Width).
-            Otherwise, the pool kernel size will be a square of an int.
-        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
-            it must contain two integers, (pool_stride_Height, pool_stride_Width).
-            Otherwise, the pool stride size will be a square of an int. Default: kernel_size.
-        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
-            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
-            it could be in three forms: `[pad_height, pad_width]` or
-            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
-            `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
-            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
-            Otherwise, the pool padding size will be a square of an int.
-        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
-        return_indices (bool): Whether to return the max indices along with the outputs.
-        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
-                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
-                        `[batch_size, input_channels, input_height, input_width]`.
-        name(str, optional): For detailed information, please refer
-                             to :ref:`api_guide_Name`. Usually name is no need to set and
-                             None by default.
-
-    Returns: None
-    Raises:
-        ValueError: If `padding` is a string, but not "SAME" or "VALID".
-        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
-        ShapeError: If the output's shape calculated is not greater than 0.
-    Examples:
-        .. code-block:: python
-          import paddle
-          import paddle.nn as nn
-          import numpy as np
-          paddle.disable_static()
-
-          # max pool2d
-          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
-          MaxPool2d = nn.MaxPool2d(kernel_size=2,
-                                   stride=2, padding=0)
-          output = MaxPool2d(input)
-          # output.shape [1, 3, 16, 16]
-
-          # for return_indices=True
-          MaxPool2d = nn.MaxPool2d(kernel_size=2,stride=2, padding=0, return_indices=True)
-          output, max_indices = MaxPool2d(input)
-          # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
-    """
-
-    def __init__(self,
-                 kernel_size,
-                 stride=None,
-                 padding=0,
-                 return_indices=False,
-                 ceil_mode=False,
-                 data_format="NCHW",
-                 name=None):
-        super(MaxPool2d, self).__init__()
-        self.ksize = kernel_size
-        self.stride = stride
-        self.padding = padding
-        self.return_indices = return_indices
-        self.ceil_mode = ceil_mode
-        self.data_format = data_format
-        self.name = name
-
-    def forward(self, x):
-        return F.max_pool2d(
-            x,
-            kernel_size=self.ksize,
-            stride=self.stride,
-            padding=self.padding,
-            return_indices=self.return_indices,
-            data_format=self.data_format,
-            name=self.name)
-
-
-class MaxPool3d(layers.Layer):
-    """
-    This operation applies 3D max pooling over input features based on the input,
-    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
-    in NCDHW format, where N is batch size, C is the number of channels,
-    H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.
+    in NCDHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.
 
     Args:
-        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
+        kernel_size (int|list|tuple): The pool kernel size. If the kernel size
             is a tuple or list, it must contain three integers,
-            (pool_size_Depth, pool_size_Height, pool_size_Width).
+            (kernel_size_Depth, kernel_size_Height, kernel_size_Width).
             Otherwise, the pool kernel size will be the cube of an int.
-        stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
-            'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
-            it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
-            Otherwise, the pool stride size will be a cube of an int. Default kernel_size.
-        padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
-            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
-            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
-            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
-            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
-            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
-        ceil_mode (bool): when True, will use ceil instead of floor to compute the output shape.
-        count_include_pad (bool): Whether to exclude padding points in average pooling
-                          mode, default is True.
-        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
-                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
-                        `[batch_size, input_channels, input_height, input_width]`.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain three integers, (stride_Depth, stride_Height, stride_Width).
+            Otherwise, the pool stride size will be a cube of an int.
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_width] whose value means the padding size of each dimension.
+            4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
+        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape.
+        return_indices (bool): Whether to return the max indices along with the outputs.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
+                        The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_depth, input_height, input_width]`.
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
@@ -742,6 +540,11 @@ class MaxPool3d(layers.Layer):
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is "VALID", but `ceil_mode` is True.
         ShapeError: If the output's shape calculated is not greater than 0.
+
+    Shape:
+        - x: 5-D tensor.
+        - out: 5-D tensor.
+
     Examples:
         .. code-block:: python
           import paddle
@@ -790,88 +593,457 @@ class MaxPool3d(layers.Layer):
             name=self.name)
 
 
-class AvgPool3d(layers.Layer):
+class AdaptiveAvgPool1d(layers.Layer):
     """
-    This operation applies 3D max pooling over input features based on the input,
-    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
-    in NCDHW format, where N is batch size, C is the number of channels,
-    H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.
+
+    This operation applies a 1D adaptive average pooling over an input signal composed
+    of several input planes, based on the input and output_size parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    For average adaptive pool1d:
+
+    ..  math::
+
+       lstart &= floor(i * L_{in} / L_{out})
+
+       lend &= ceil((i + 1) * L_{in} / L_{out})
+
+       Output(i) &= \\frac{sum(Input[lstart:lend])}{(lend - lstart)}
 
     Args:
-        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
-            is a tuple or list, it must contain three integers,
-            (pool_size_Depth, pool_size_Height, pool_size_Width).
-            Otherwise, the pool kernel size will be the cube of an int.
-        stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
-            'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
-            it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
-            Otherwise, the pool stride size will be a cube of an int.
-        padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
-            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
-            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
-            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
-            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
-            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
-        ceil_mode (bool): ${ceil_mode_comment}
-        count_include_pad (bool): Whether to exclude padding points in average pooling
-                          mode, default is True.
-        divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
-        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
-                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
-                        `[batch_size, input_channels, input_height, input_width]`.
+        output_size (int|list|tuple): The target output size. If output_size is a tuple or list,
+            it must contain one int.
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
 
-    Returns: None.
+    Returns:
+        None.
+
     Raises:
-        ValueError: If `padding` is a string, but not "SAME" or "VALID".
-        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
-        ShapeError: If the output's shape calculated is not greater than 0.
+        ValueError: 'output_size' should be an integer or a list or tuple with length 1.
+
+    Shape:
+        - x: 3-D tensor.
+        - out: 3-D tensor.
+
     Examples:
         .. code-block:: python
+
+          # average adaptive pool1d
+          # suppose input data in shape of [N, C, L], `output_size` is m or [m],
+          # output shape is [N, C, m], adaptive pool divide L dimension
+          # of input data into m grids averagely and performs poolings in each
+          # grid to get output.
+          # adaptive avg pool performs calculations as follows:
+          #
+          #     for i in range(m):
+          #         lstart = floor(i * L / m)
+          #         lend = ceil((i + 1) * L / m)
+          #         output[:, :, i] = sum(input[:, :, lstart: lend]) / (lend - lstart)
+          #
           import paddle
           import paddle.nn as nn
-          import numpy as np
           paddle.disable_static()
 
-          # avg pool3d
-          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32))
-          AvgPool3d = nn.AvgPool3d(kernel_size=2,
-                                   stride=2, padding=0)
-          output = AvgPool3d(input)
-          # output.shape [1, 2, 3, 16, 16]
-
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          AdaptiveAvgPool1d = nn.AdaptiveAvgPool1d(output_size=16)
+          pool_out = AdaptiveAvgPool1d(data)
+          # pool_out shape: [1, 3, 16]
     """
 
-    def __init__(self,
-                 kernel_size,
-                 stride,
-                 padding=0,
-                 ceil_mode=False,
-                 count_include_pad=True,
-                 divisor_override=None,
-                 data_format="NCDHW",
-                 name=None):
-        super(AvgPool3d, self).__init__()
-        self.ksize = kernel_size
-        self.stride = stride
-        self.padding = padding
-        self.ceil_mode = ceil_mode
-        self.count_include_pad = count_include_pad
-        self.divisor = divisor_override
-        self.data_format = data_format
+    def __init__(self, output_size, name=None):
+        super(AdaptiveAvgPool1d, self).__init__()
+        self.output_size = output_size
         self.name = name
 
-    def forward(self, x):
-        return F.avg_pool3d(
-            x,
-            kernel_size=self.ksize,
-            stride=self.stride,
-            padding=self.padding,
-            ceil_mode=self.ceil_mode,
-            count_include_pad=self.count_include_pad,
-            divisor_override=self.divisor,
-            data_format=self.data_format,
-            name=self.name)
+    def forward(self, input):
+        return F.adaptive_avg_pool1d(input, self.output_size, self.name)
+
+
+class AdaptiveAvgPool2d(layers.Layer):
+    """
+
+    This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions
+    of the output tensor are determined by the parameter output_size.
+
+    For avg adaptive pool2d:
+
+    ..  math::
+
+       hstart &= floor(i * H_{in} / H_{out})
+
+       hend &= ceil((i + 1) * H_{in} / H_{out})
+
+       wstart &= floor(j * W_{in} / W_{out})
+
+       wend &= ceil((j + 1) * W_{in} / W_{out})
+
+       Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+
+
+    Parameters:
+        output_size (int|list|tuple): The target output size. If output_size is a tuple or list,
+            it must contain two elements, (H, W). H and W can be either an int, or None which means
+            the size will be the same as that of the input.
+        data_format (str): The data format of the input and output data. An optional string
+            from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
+            the order of: [batch_size, input_channels, input_height, input_width].
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Shape:
+        x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type can be float32, float64.
+        output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type is same as input x.
+
+    Returns:
+        A callable object of AdaptiveAvgPool2d.
+
+    Examples:
+        .. code-block:: python
+
+            # adaptive avg pool2d
+            # suppose input data in shape of [N, C, H, W], `output_size` is [m, n],
+            # output shape is [N, C, m, n], adaptive pool divide H and W dimensions
+            # of input data into m * n grids averagely and performs poolings in each
+            # grid to get output.
+            # adaptive avg pool performs calculations as follow:
+            #
+            #     for i in range(m):
+            #         for j in range(n):
+            #             hstart = floor(i * H / m)
+            #             hend = ceil((i + 1) * H / m)
+            #             wstart = floor(j * W / n)
+            #             wend = ceil((j + 1) * W / n)
+            #             output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
+            #
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(2, 3, 32, 32)
+            x = paddle.to_tensor(input_data)
+            # x.shape is [2, 3, 32, 32]
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=3)
+            pool_out = adaptive_avg_pool(x = x)
+            # pool_out.shape is [2, 3, 3, 3]
+    """
+
+    def __init__(self, output_size, data_format="NCHW", name=None):
+        super(AdaptiveAvgPool2d, self).__init__()
+        self._output_size = output_size
+        self._data_format = data_format
+        self._name = name
+
+    def forward(self, x):
+        return F.adaptive_avg_pool2d(
+            x,
+            output_size=self._output_size,
+            data_format=self._data_format,
+            name=self._name)
+
+
+class AdaptiveAvgPool3d(layers.Layer):
+    """
+
+    This operation applies 3D adaptive avg pooling on input tensor. The d, h and w dimensions
+    of the output tensor are determined by the parameter output_size.
+
+    For avg adaptive pool3d:
+
+    ..  math::
+
+      dstart &= floor(i * D_{in} / D_{out})
+
+      dend &= ceil((i + 1) * D_{in} / D_{out})
+
+      hstart &= floor(j * H_{in} / H_{out})
+
+      hend &= ceil((j + 1) * H_{in} / H_{out})
+
+      wstart &= floor(k * W_{in} / W_{out})
+
+      wend &= ceil((k + 1) * W_{in} / W_{out})
+
+      Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+
+
+    Parameters:
+        output_size (int|list|tuple): The target output size. If output_size is a tuple or list,
+            it must contain three elements, (D, H, W). D, H and W can be either an int, or None which means
+            the size will be the same as that of the input.
+        data_format (str): The data format of the input and output data. An optional string
+            from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in
+            the order of: [batch_size, input_channels, input_depth, input_height, input_width].
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+    Shape:
+        x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type can be float32, float64.
+        output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x.
+
+    Returns:
+        A callable object of AdaptiveAvgPool3d.
+
+    Examples:
+        .. code-block:: python
+
+            # adaptive avg pool3d
+            # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n],
+            # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
+            # of input data into l * m * n grids averagely and performs poolings in each
+            # grid to get output.
+            # adaptive avg pool performs calculations as follow:
+            #
+            #     for i in range(l):
+            #         for j in range(m):
+            #             for k in range(n):
+            #                 dstart = floor(i * D / l)
+            #                 dend = ceil((i + 1) * D / l)
+            #                 hstart = floor(j * H / m)
+            #                 hend = ceil((j + 1) * H / m)
+            #                 wstart = floor(k * W / n)
+            #                 wend = ceil((k + 1) * W / n)
+            #                 output[:, :, i, j, k] =
+            #                     avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(2, 3, 8, 32, 32)
+            x = paddle.to_tensor(input_data)
+            # x.shape is [2, 3, 8, 32, 32]
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=3)
+            pool_out = adaptive_avg_pool(x = x)
+            # pool_out.shape is [2, 3, 3, 3, 3]
+    """
+
+    def __init__(self, output_size, data_format="NCDHW", name=None):
+        super(AdaptiveAvgPool3d, self).__init__()
+        self._output_size = output_size
+        self._data_format = data_format
+        self._name = name
+
+    def forward(self, x):
+        return F.adaptive_avg_pool3d(
+            x,
+            output_size=self._output_size,
+            data_format=self._data_format,
+            name=self._name)
+
+
+class AdaptiveMaxPool1d(layers.Layer):
+    """
+
+    This operation applies a 1D adaptive max pooling over an input signal composed
+    of several input planes, based on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    For max adaptive pool1d:
+
+    ..  math::
+
+       lstart &= floor(i * L_{in} / L_{out})
+
+       lend &= ceil((i + 1) * L_{in} / L_{out})
+
+       Output(i) &= max(Input[lstart:lend])
+
+    Args:
+        output_size (int|list|tuple): The target output size. If output_size is a tuple or list,
+            it must contain one int.
+        return_indices (bool): If true, the index of max pooling point will be returned along
+            with outputs. It cannot be set in average pooling type. Default False.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+    Returns:
+        None.
+
+    Raises:
+        ValueError: 'output_size' should be an integer or a list or tuple with length 1.
+
+    Shape:
+        x (Tensor): The input tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type can be float32, float64.
+        output (Tensor): The output tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type is same as input x.
+
+    Examples:
+        .. code-block:: python
+
+          # max adaptive pool1d
+          # suppose input data in shape of [N, C, L], `output_size` is m or [m],
+          # output shape is [N, C, m], adaptive pool divide L dimension
+          # of input data into m grids averagely and performs poolings in each
+          # grid to get output.
+          # adaptive max pool performs calculations as follow:
+          #
+          #     for i in range(m):
+          #         lstart = floor(i * L / m)
+          #         lend = ceil((i + 1) * L / m)
+          #         output[:, :, i] = max(input[:, :, lstart: lend])
+          #
+          import paddle
+          import paddle.nn as nn
+          paddle.disable_static()
+
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16)
+          pool_out = AdaptiveMaxPool1d(data)
+          # pool_out shape: [1, 3, 16]
+
+          # for return_indices = true
+          AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16, return_indices=True)
+          pool_out, indices = AdaptiveMaxPool1d(data)
+          # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
+
+    """
+
+    def __init__(self, output_size, return_indices=False, name=None):
+        super(AdaptiveMaxPool1d, self).__init__()
+        self.output_size = output_size
+        self.return_indices = return_indices
+        self.name = name
+
+    def forward(self, input):
+        return F.adaptive_max_pool1d(input, self.output_size,
+                                     self.return_indices, self.name)
+
+
+class AdaptiveMaxPool2d(layers.Layer):
+    """
+    This operation applies 2D adaptive max pooling on input tensor. The h and w dimensions
+    of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and ordinary pooling is that the adaptive one focuses on the output size.
+    For adaptive max pool2d:
+    ..  math::
+       hstart &= floor(i * H_{in} / H_{out})
+       hend &= ceil((i + 1) * H_{in} / H_{out})
+       wstart &= floor(j * W_{in} / W_{out})
+       wend &= ceil((j + 1) * W_{in} / W_{out})
+       Output(i ,j) &= max(Input[hstart:hend, wstart:wend])
+    Parameters:
+        output_size (int|list|tuple): The target output size. If output_size is a tuple or list, it must contain two elements, (H, W). H and W can be either an int, or None which means the size will be the same as that of the input.
+        return_indices (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+    Shape:
+        x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float32, float64.
+        output (Tensor): The output tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type is same as input x.
+    
+    Returns:
+        A callable object of AdaptiveMaxPool2d.
+    Examples:
+        .. code-block:: python
+            # adaptive max pool2d
+            # suppose input data in shape of [N, C, H, W], `output_size` is [m, n],
+            # output shape is [N, C, m, n], adaptive pool divide H and W dimensions
+            # of input data into m * n grids averagely and performs poolings in each
+            # grid to get output.
+            # adaptive max pool performs calculations as follow:
+            #
+            #     for i in range(m):
+            #         for j in range(n):
+            #             hstart = floor(i * H / m)
+            #             hend = ceil((i + 1) * H / m)
+            #             wstart = floor(j * W / n)
+            #             wend = ceil((j + 1) * W / n)
+            #             output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend])
+            #
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(2, 3, 32, 32)
+            x = paddle.to_tensor(input_data)
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=3, return_indices=True)
+            pool_out, indices = adaptive_max_pool(x = x)
+    """
+
+    def __init__(self, output_size, return_indices=False, name=None):
+        super(AdaptiveMaxPool2d, self).__init__()
+        self._output_size = output_size
+        self._return_indices = return_indices
+        self._name = name
+
+    def forward(self, x):
+        return F.adaptive_max_pool2d(
+            x,
+            output_size=self._output_size,
+            return_indices=self._return_indices,
+            name=self._name)
+
+
+class AdaptiveMaxPool3d(layers.Layer):
+    """
+    This operation applies 3D adaptive max pooling on input tensor. The d, h and w dimensions
+    of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and ordinary pooling is that the adaptive one focuses on the output size.
+    For adaptive max pool3d:
+    ..  math::
+      dstart &= floor(i * D_{in} / D_{out})
+      dend &= ceil((i + 1) * D_{in} / D_{out})
+      hstart &= floor(j * H_{in} / H_{out})
+      hend &= ceil((j + 1) * H_{in} / H_{out})
+      wstart &= floor(k * W_{in} / W_{out})
+      wend &= ceil((k + 1) * W_{in} / W_{out})
+      Output(i ,j, k) &= max(Input[dstart:dend, hstart:hend, wstart:wend])
+    Parameters:
+        output_size (int|list|tuple): The target output size. If output_size is a tuple or list,
+            it must contain three elements, (D, H, W). D, H and W can be either an int, or None which means
+            the size will be the same as that of the input.
+        return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+    Shape:
+        x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64.
+        output (Tensor): The output tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type is same as input x.
+    Returns:
+        A callable object of AdaptiveMaxPool3d.
+    Examples:
+        .. code-block:: python
+            # adaptive max pool3d
+            # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n],
+            # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
+            # of input data into l * m * n grids averagely and performs poolings in each
+            # grid to get output.
+            # adaptive max pool performs calculations as follow:
+            #
+            #     for i in range(l):
+            #         for j in range(m):
+            #             for k in range(n):
+            #                 dstart = floor(i * D / l)
+            #                 dend = ceil((i + 1) * D / l)
+            #                 hstart = floor(j * H / m)
+            #                 hend = ceil((j + 1) * H / m)
+            #                 wstart = floor(k * W / n)
+            #                 wend = ceil((k + 1) * W / n)
+            #                 output[:, :, i, j, k] =
+            #                     max(input[:, :, dstart:dend, hstart: hend, wstart: wend])
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(2, 3, 8, 32, 32)
+            x = paddle.to_tensor(input_data)
+            pool = paddle.nn.AdaptiveMaxPool3d(output_size=4)
+            out = pool(x)
+            # out shape: [2, 3, 4, 4, 4]
+            pool = paddle.nn.AdaptiveMaxPool3d(output_size=3, return_indices=True)
+            out, indices = pool(x)
+            # out shape: [2, 3, 3, 3, 3], indices shape: [2, 3, 3, 3, 3]
+            
+    """
+
+    def __init__(self, output_size, return_indices=False, name=None):
+        super(AdaptiveMaxPool3d, self).__init__()
+        self._output_size = output_size
+        self._return_indices = return_indices
+        self._name = name
+
+    def forward(self, x):
+        return F.adaptive_max_pool3d(
+            x,
+            output_size=self._output_size,
+            return_indices=self._return_indices,
+            name=self._name)
diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py
index 4717609503f7faafc16d8c15e3d404b0d780c3e1..6f1c5f199ac99692840ad3c5cffdb726baf5fa19 100644
--- a/python/paddle/nn/layer/rnn.py
+++ b/python/paddle/nn/layer/rnn.py
@@ -12,10 +12,1333 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define classes of recurrent neural network  
+import copy
+import collections
+import itertools
+import six
+import math
+import sys
+import warnings
+from functools import partial, reduce
+
+import paddle
+from paddle import framework
+from paddle.nn import functional as F
+from paddle.nn import initializer as I
+from paddle.fluid.dygraph import Layer, LayerList
+from paddle.fluid.layers import utils
+from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
+from paddle.fluid.data_feeder import convert_dtype
 
 __all__ = [
-    #       'RNNCell',
-    #       'GRUCell',
-    #       'LSTMCell'
+    'RNNCellBase',
+    'SimpleRNNCell',
+    'LSTMCell',
+    'GRUCell',
+    'RNN',
+    'BiRNN',
+    'SimpleRNN',
+    'LSTM',
+    'GRU',
 ]
+
+
+def split_states(states, bidirectional=False, state_components=1):
+    r"""
+    Break the stacked states of a whole RNN network apart into the states
+    of its individual cells, returned as a (possibly nested) list or tuple.
+
+    Arguments:
+        states (Tensor|tuple|list): the concatenated states of the network.
+            When `state_components` is 1, `states` is a single Tensor with
+            shape `(L*D, N, C)`, where `L` is the number of layers of the
+            RNN network, `D` is the number of directions of the network
+            (1 for unidirectional RNNs and 2 for bidirectional RNNs), `N`
+            is the batch size of the input to the network and `C` is the
+            hidden size of the network.
+
+            When `state_components` is larger than 1, `states` is a tuple
+            of `state_components` Tensors, each of which meets the
+            requirements described above.
+
+            `state_components` is 1 for SimpleRNNs and GRUs, and 2 for
+            LSTMs.
+        bidirectional (bool): whether the states come from a bidirectional
+            RNN network. Defaults to False.
+        state_components (int): the number of components each cell state
+            consists of, see `states` above. Defaults to 1.
+
+    Returns:
+        A nested list or tuple of RNN cell states.
+        If `bidirectional` is True, index it twice to reach one RNN cell
+        state: the first index selects the layer and the second index
+        selects the direction.
+        If `bidirectional` is False, index it once, by layer, to reach one
+        RNN cell state.
+        When `state_components` is larger than 1, an RNN cell state can be
+        indexed one more time to get a tensor of shape `(N, C)`, where `N`
+        is the batch size of the input to the RNN cell and `C` is the
+        hidden size of the RNN cell.
+    """
+    if state_components == 1:
+        per_cell = paddle.unstack(states)
+        if not bidirectional:
+            return per_cell
+    else:
+        assert len(states) == state_components
+        # Unstack every component, then regroup so that each entry bundles
+        # all components belonging to one cell.
+        unstacked = tuple([paddle.unstack(item) for item in states])
+        per_cell = list(zip(*unstacked))
+        if not bidirectional:
+            return per_cell
+    # Bidirectional: consecutive entries are the two directions of a layer.
+    return list(zip(per_cell[::2], per_cell[1::2]))
+
+
+def concat_states(states, bidirectional=False, state_components=1):
+    r"""
+    Pack a (possibly nested) list or tuple of RNN cell states back into the
+    compact, stacked form used for a whole RNN network.
+
+    Arguments:
+        states (list|tuple): a possibly nested list or tuple of RNN cell
+            states.
+            If `bidirectional` is True, it is indexed twice to reach one
+            RNN cell state: the first index selects the layer and the
+            second index selects the direction.
+            If `bidirectional` is False, it is indexed once, by layer, to
+            reach one RNN cell state.
+            When `state_components` is larger than 1, an RNN cell state
+            can be indexed one more time to get a tensor of shape `(N, C)`,
+            where `N` is the batch size of the input to the RNN cell and
+            `C` is the hidden size of the RNN cell.
+        bidirectional (bool): whether the states come from a bidirectional
+            RNN network. Defaults to False.
+        state_components (int): the number of components each cell state
+            consists of, see `states` above. Defaults to 1.
+
+    Returns:
+        The concatenated states for the RNN network.
+        When `state_components` is 1, a Tensor with shape `(L*D, N, C)`,
+        where `L` is the number of layers of the RNN network, `D` is the
+        number of directions of the network (1 for unidirectional RNNs and
+        2 for bidirectional RNNs), `N` is the batch size of the input to
+        the network and `C` is the hidden size of the network.
+        Otherwise, a list of `state_components` such Tensors.
+
+    """
+    flat_states = flatten(states)
+    if state_components == 1:
+        return paddle.stack(flat_states)
+    # Component k of each cell is every `state_components`-th entry from k.
+    components = [
+        flat_states[k::state_components] for k in range(state_components)
+    ]
+    return [paddle.stack(group) for group in components]
+
+
+class RNNCellBase(Layer):
+    r"""
+    RNNCellBase is the base class for abstraction representing the calculations
+    mapping the input and state to the output and new state. It is suitable to
+    and mostly used in RNN.
+    """
+
+    def get_initial_states(self,
+                           batch_ref,
+                           shape=None,
+                           dtype=None,
+                           init_value=0.,
+                           batch_dim_idx=0):
+        r"""
+        Generate initialized states according to provided shape, data type and
+        value.
+        Arguments:
+            batch_ref (Tensor): A tensor (or a nested structure of tensors, of
+                which the first one is used) whose shape determines the batch
+                size of the generated initial states. For `batch_ref`'s shape
+                d, `d[batch_dim_idx]` is treated as batch size.
+            shape (list|tuple, optional): A (possibly nested structure of) shape[s],
+                where a shape is a list/tuple of integers. `-1` (for batch size)
+                will be automatically prepended if a shape does not start with
+                it. If None, property `state_shape` will be used. Defaults to
+                None.
+            dtype (str|list|tuple, optional): A (possibly nested structure of)
+                data type[s]. The structure must be same as that of `shape`,
+                except when all tensors in states have the same data type, a
+                single data type can be used. If None and property `state_dtype`
+                is not available, current default floating type of paddle is
+                used. Defaults to None.
+            init_value (float, optional): A float value used to initialize states.
+                Defaults to 0.
+            batch_dim_idx (int, optional): An integer indicating which
+                dimension of `batch_ref` represents batch. Defaults to 0.
+        Returns:
+            init_states (Tensor|tuple|list): tensor of the provided shape and
+                dtype, or list of tensors that each satisfies the requirements,
+                packed in the same structure as `shape` and `dtype` does.
+        """
+        # TODO: use inputs and batch_size
+        batch_ref = flatten(batch_ref)[0]
+
+        def _is_shape_sequence(seq):
+            """For shape, list/tuple of integer is the finest-grained object"""
+            if isinstance(seq, (list, tuple)) and all(
+                    isinstance(x, six.integer_types) for x in seq):
+                return False
+            # TODO: Add check for the illegal
+            if isinstance(seq, dict):
+                return True
+            # NOTE(review): `collections.Sequence` is removed in Python 3.10+;
+            # use `collections.abc.Sequence` once Python 2 support is dropped.
+            return (isinstance(seq, collections.Sequence) and
+                    not isinstance(seq, six.string_types))
+
+        class Shape(object):
+            def __init__(self, shape):
+                self.shape = shape if shape[0] == -1 else ([-1] + list(shape))
+
+        # nested structure of shapes
+        states_shapes = self.state_shape if shape is None else shape
+        # Temporarily swap `utils.is_sequence` so that a flat list/tuple of
+        # ints counts as one leaf; always restore the original, even on error.
+        is_sequence_ori = utils.is_sequence
+        utils.is_sequence = _is_shape_sequence
+        try:
+            states_shapes = map_structure(lambda shape: Shape(shape),
+                                          states_shapes)
+        finally:
+            utils.is_sequence = is_sequence_ori
+
+        # nested structure of dtypes
+        try:
+            states_dtypes = self.state_dtype if dtype is None else dtype
+        except NotImplementedError:
+            states_dtypes = framework.get_default_dtype()
+        if len(flatten(states_dtypes)) == 1:
+            dtype = flatten(states_dtypes)[0]
+            states_dtypes = map_structure(lambda shape: dtype, states_shapes)
+
+        init_states = map_structure(
+            lambda shape, dtype: paddle.fluid.layers.fill_constant_batch_size_like(
+                input=batch_ref,
+                shape=shape.shape,
+                dtype=dtype,
+                value=init_value,
+                input_dim_idx=batch_dim_idx), states_shapes, states_dtypes)
+        return init_states
+
+    @property
+    def state_shape(self):
+        r"""
+        Abstract method (property).
+        Used to initialize states.
+        A (possibly nested structure of) shape[s], where a shape is a
+        list/tuple of integers (-1 for batch size would be automatically
+        inserted into a shape if shape is not started with it).
+        Not necessary to be implemented if states are not initialized by
+        `get_initial_states` or the `shape` argument is provided when using
+        `get_initial_states`.
+        """
+        raise NotImplementedError(
+            "Please add implementaion for `state_shape` in the used cell.")
+
+    @property
+    def state_dtype(self):
+        r"""
+        Abstract method (property).
+        Used to initialize states.
+        A (possibly nested structure of) data type[s]. The structure must be
+        same as that of `shape`, except when all tensors in states have the
+        same data type, a single data type can be used.
+        Not necessary to be implemented if states are not initialized
+        by `get_initial_states` or the `dtype` argument is provided when using
+        `get_initial_states`.
+        """
+        raise NotImplementedError(
+            "Please add implementaion for `state_dtype` in the used cell.")
+
+
+class SimpleRNNCell(RNNCellBase):
+    r"""
+    Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
+    computes the outputs and updates states.
+
+    The formula used is as follows:
+
+    .. math::
+        h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh}) \\
+        y_{t} & = h_{t}
+
+    where :math:`\mathrm{tanh}` is replaced by ReLU when `activation` is set
+    to `relu`.
+
+    Please refer to `Finding Structure in Time
+    <https://crl.ucsd.edu/~elman/Papers/fsit.pdf>`_ for more details.
+
+    Arguments:
+        input_size (int): The input size.
+        hidden_size (int): The hidden size.
+        activation (str, optional): The activation in the SimpleRNN cell.
+            It can be `tanh` or `relu`. Defaults to `tanh`.
+        weight_ih_attr (ParamAttr, optional): The parameter attribute for
+            `weight_ih`. Default: None.
+        weight_hh_attr(ParamAttr, optional): The parameter attribute for
+            `weight_hh`. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_ih`. Default: None.
+        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_hh`. Default: None.
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Parameters:
+        weight_ih (Parameter): shape (hidden_size, input_size), input to hidden
+            weight, corresponding to :math:`W_{ih}` in the formula.
+        weight_hh (Parameter): shape (hidden_size, hidden_size), hidden to
+            hidden weight, corresponding to :math:`W_{hh}` in the formula.
+        bias_ih (Parameter): shape (hidden_size, ), input to hidden bias,
+            corresponding to :math:`b_{ih}` in the formula.
+        bias_hh (Parameter): shape (hidden_size, ), hidden to hidden bias,
+            corresponding to :math:`b_{hh}` in the formula.
+
+    Inputs:
+        inputs (Tensor): shape `[batch_size, input_size]`, the input,
+            corresponding to :math:`x_t` in the formula.
+        states (Tensor, optional): shape `[batch_size, hidden_size]`, the
+            previous hidden state, corresponding to :math:`h_{t-1}` in the
+            formula. When states is None, zero state is used. Defaults to
+            None.
+
+    Returns:
+        (outputs, new_states)
+        outputs (Tensor): shape `[batch_size, hidden_size]`, the output,
+            corresponding to :math:`h_{t}` in the formula.
+        states (Tensor): shape `[batch_size, hidden_size]`, the new hidden
+            state, corresponding to :math:`h_{t}` in the formula.
+
+    Notes:
+        All the weights and bias are initialized with `Uniform(-std, std)` by
+        default. Where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more
+        information about parameter initialization, please refer to
+         :ref:`api_fluid_ParamAttr`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            x = paddle.randn((4, 16))
+            prev_h = paddle.randn((4, 32))
+
+            cell = paddle.nn.SimpleRNNCell(16, 32)
+            y, h = cell(x, prev_h)
+
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 activation="tanh",
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super(SimpleRNNCell, self).__init__()
+        std = 1.0 / math.sqrt(hidden_size)
+        self.weight_ih = self.create_parameter(
+            (hidden_size, input_size),
+            weight_ih_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.weight_hh = self.create_parameter(
+            (hidden_size, hidden_size),
+            weight_hh_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_ih = self.create_parameter(
+            (hidden_size, ),
+            bias_ih_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_hh = self.create_parameter(
+            (hidden_size, ),
+            bias_hh_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        if activation not in ["tanh", "relu"]:
+            raise ValueError(
+                "activation for SimpleRNNCell should be tanh or relu, "
+                "but get {}".format(activation))
+        self.activation = activation
+        self._activation_fn = paddle.tanh \
+            if activation == "tanh" \
+            else F.relu
+
+    def forward(self, inputs, states=None):
+        if states is None:
+            states = self.get_initial_states(inputs, self.state_shape)
+        pre_h = states
+        i2h = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
+        if self.bias_ih is not None:
+            i2h += self.bias_ih
+        h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
+        if self.bias_hh is not None:
+            h2h += self.bias_hh
+        h = self._activation_fn(i2h + h2h)
+        return h, h  # the output and the new state are the same tensor
+
+    @property
+    def state_shape(self):
+        return (self.hidden_size, )
+
+
+class LSTMCell(RNNCellBase):
+    r"""
+    Long-Short Term Memory(LSTM) RNN cell. Given the inputs and previous states,
+    it computes the outputs and updates states.
+
+    The formula used is as follows:
+
+    .. math::
+        i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi}) \\
+        f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf}) \\
+        o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho}) \\
+        \widetilde{c}_{t} & = \tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg}) \\
+        c_{t} & = f_{t} * c_{t-1} + i_{t} * \widetilde{c}_{t} \\
+        h_{t} & = o_{t} * \tanh(c_{t}) \\
+        y_{t} & = h_{t}
+
+    where :math:`\sigma` is the sigmoid function, and \* is the elementwise
+    multiplication operator.
+
+    Please refer to `An Empirical Exploration of Recurrent Network Architectures
+    <http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.
+
+    Arguments:
+        input_size (int): The input size.
+        hidden_size (int): The hidden size.
+        weight_ih_attr(ParamAttr, optional): The parameter attribute for
+            `weight_ih`. Default: None.
+        weight_hh_attr(ParamAttr, optional): The parameter attribute for
+            `weight_hh`. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_ih`. Default: None.
+        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_hh`. Default: None.
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Parameters:
+        weight_ih (Parameter): shape (4 * hidden_size, input_size), input to
+            hidden weight, which corresponds to the concatenation of
+             :math:`W_{ii}, W_{if}, W_{ig}, W_{io}` in the formula.
+        weight_hh (Parameter): shape (4 * hidden_size, hidden_size), hidden to
+            hidden weight, which corresponds to the concatenation of
+             :math:`W_{hi}, W_{hf}, W_{hg}, W_{ho}` in the formula.
+        bias_ih (Parameter): shape (4 * hidden_size, ), input to hidden bias,
+            which corresponds to the concatenation of
+             :math:`b_{ii}, b_{if}, b_{ig}, b_{io}` in the formula.
+        bias_hh (Parameter): shape (4 * hidden_size, ), hidden to hidden bias,
+            which corresponds to the concatenation of
+             :math:`b_{hi}, b_{hf}, b_{hg}, b_{ho}` in the formula.
+
+    Inputs:
+        inputs (Tensor): shape `[batch_size, input_size]`, the input,
+            corresponding to :math:`x_t` in the formula.
+        states (tuple, optional): a tuple of two tensors, each of shape
+            `[batch_size, hidden_size]`, the previous hidden state,
+            corresponding to :math:`h_{t-1}, c_{t-1}` in the formula.
+            When states is None, zero state is used. Defaults to None.
+
+    Returns:
+        (outputs, new_states)
+        outputs (Tensor): shape `[batch_size, hidden_size]`, the output,
+            corresponding to :math:`h_{t}` in the formula.
+        states (tuple): a tuple of two tensors, each of shape
+            `[batch_size, hidden_size]`, the new hidden states,
+            corresponding to :math:`h_{t}, c_{t}` in the formula.
+
+    Notes:
+        All the weights and bias are initialized with `Uniform(-std, std)` by
+        default. Where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more
+        information about parameter initialization, please refer to
+         :ref:`api_fluid_ParamAttr`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            x = paddle.randn((4, 16))
+            prev_h = paddle.randn((4, 32))
+            prev_c = paddle.randn((4, 32))
+
+            cell = paddle.nn.LSTMCell(16, 32)
+            y, (h, c) = cell(x, (prev_h, prev_c))
+
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super(LSTMCell, self).__init__()
+        std = 1.0 / math.sqrt(hidden_size)
+        self.weight_ih = self.create_parameter(
+            (4 * hidden_size, input_size),
+            weight_ih_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.weight_hh = self.create_parameter(
+            (4 * hidden_size, hidden_size),
+            weight_hh_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_ih = self.create_parameter(
+            (4 * hidden_size, ),
+            bias_ih_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_hh = self.create_parameter(
+            (4 * hidden_size, ),
+            bias_hh_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+
+        self.hidden_size = hidden_size
+        self.input_size = input_size
+        self._gate_activation = F.sigmoid
+        self._activation = paddle.tanh
+
+    def forward(self, inputs, states=None):
+        if states is None:
+            states = self.get_initial_states(inputs, self.state_shape)
+        pre_hidden, pre_cell = states
+        gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
+        if self.bias_ih is not None:
+            gates = gates + self.bias_ih
+        gates += paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
+        if self.bias_hh is not None:
+            gates = gates + self.bias_hh
+
+        chunked_gates = paddle.split(gates, num_or_sections=4, axis=-1)
+
+        i = self._gate_activation(chunked_gates[0])
+        f = self._gate_activation(chunked_gates[1])
+        o = self._gate_activation(chunked_gates[3])  # chunk 2 is the candidate
+        c = f * pre_cell + i * self._activation(chunked_gates[2])
+        h = o * self._activation(c)
+
+        return h, (h, c)
+
+    @property
+    def state_shape(self):
+        r"""
+        The `state_shape` of LSTMCell is a tuple with two shapes:
+        `((hidden_size, ), (hidden_size,))`. (-1 for batch size would be
+        automatically inserted into shape). These two shapes correspond
+        to :math:`h_{t-1}` and :math:`c_{t-1}` separately.
+        """
+        return ((self.hidden_size, ), (self.hidden_size, ))
+
+
+class GRUCell(RNNCellBase):
+    r"""
+    Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
+    it computes the outputs and updates states.
+
+    The formula for GRU used is as follows:
+
+    .. math::
+
+        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr}) \\
+        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz}) \\
+        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc})) \\
+        h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t} \\
+        y_{t} & = h_{t}
+
+    where :math:`\sigma` is the sigmoid function, and \* is the elementwise
+    multiplication operator.
+
+    Please refer to `An Empirical Exploration of Recurrent Network Architectures
+    <http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.
+
+    Arguments:
+        input_size (int): The input size.
+        hidden_size (int): The hidden size.
+        weight_ih_attr(ParamAttr, optional): The parameter attribute for
+            `weight_ih`. Default: None.
+        weight_hh_attr(ParamAttr, optional): The parameter attribute for
+            `weight_hh`. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_ih`. Default: None.
+        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_hh`. Default: None.
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Parameters:
+        weight_ih (Parameter): shape (3 * hidden_size, input_size), input to
+            hidden weight, which corresponds to the concatenation of
+             :math:`W_{ir}, W_{iz}, W_{ic}` in the formula.
+        weight_hh (Parameter): shape (3 * hidden_size, hidden_size), hidden to
+            hidden weight, which corresponds to the concatenation of
+             :math:`W_{hr}, W_{hz}, W_{hc}` in the formula.
+        bias_ih (Parameter): shape (3 * hidden_size, ), input to hidden bias,
+            which corresponds to the concatenation of
+             :math:`b_{ir}, b_{iz}, b_{ic}` in the formula.
+        bias_hh (Parameter): shape (3 * hidden_size, ), hidden to hidden bias,
+            which corresponds to the concatenation of
+             :math:`b_{hr}, b_{hz}, b_{hc}` in the formula.
+
+    Inputs:
+        inputs (Tensor): A tensor with shape `[batch_size, input_size]`,
+            corresponding to :math:`x_t` in the formula.
+        states (Tensor, optional): shape `[batch_size, hidden_size]`, :math:`h_{t-1}`
+            in the formula. When states is None, zero state is used. Defaults to None.
+
+    Returns:
+        (outputs, new_states)
+        outputs (Tensor): shape `[batch_size, hidden_size]`, the output,
+            corresponding to :math:`h_{t}` in the formula.
+        states (Tensor): shape `[batch_size, hidden_size]`, the new hidden
+            state, corresponding to :math:`h_{t}` in the formula.
+
+    Notes:
+        All the weights and bias are initialized with `Uniform(-std, std)` by
+        default. Where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more
+        information about parameter initialization, please refer to
+         :ref:`api_fluid_ParamAttr`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            x = paddle.randn((4, 16))
+            prev_h = paddle.randn((4, 32))
+
+            cell = paddle.nn.GRUCell(16, 32)
+            y, h = cell(x, prev_h)
+
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super(GRUCell, self).__init__()
+        std = 1.0 / math.sqrt(hidden_size)
+        self.weight_ih = self.create_parameter(
+            (3 * hidden_size, input_size),
+            weight_ih_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.weight_hh = self.create_parameter(
+            (3 * hidden_size, hidden_size),
+            weight_hh_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_ih = self.create_parameter(
+            (3 * hidden_size, ),
+            bias_ih_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_hh = self.create_parameter(
+            (3 * hidden_size, ),
+            bias_hh_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+
+        self.hidden_size = hidden_size
+        self.input_size = input_size
+        self._gate_activation = F.sigmoid
+        self._activation = paddle.tanh
+
+    def forward(self, inputs, states=None):
+        if states is None:
+            states = self.get_initial_states(inputs, self.state_shape)
+
+        pre_hidden = states
+        x_gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
+        if self.bias_ih is not None:
+            x_gates = x_gates + self.bias_ih
+        h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
+        if self.bias_hh is not None:
+            h_gates = h_gates + self.bias_hh
+
+        x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
+        h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)
+
+        r = self._gate_activation(x_r + h_r)
+        z = self._gate_activation(x_z + h_z)
+        c = self._activation(x_c + r * h_c)  # apply reset gate after mm
+        h = (pre_hidden - c) * z + c  # same as z * pre_hidden + (1 - z) * c
+
+        return h, h
+
+    @property
+    def state_shape(self):
+        r"""
+        The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
+        size would be automatically inserted into shape). The shape corresponds
+        to the shape of :math:`h_{t-1}`.
+        """
+        return (self.hidden_size, )
+
+
+class RNN(Layer):
+    r"""
+    Wrapper for RNN, which creates a recurrent neural network with an RNN cell. 
+    It performs :code:`cell.forward()` repeatedly until reaches to the maximum 
+    length of `inputs`.
+
+    Arguments:
+        cell(RNNCellBase): An instance of `RNNCellBase`.
+        is_reverse (bool, optional): Indicate whether to calculate in the reverse
+            order of input sequences. Defaults to False.
+        time_major (bool): Whether the first dimension of the input means the
+            time steps. Defaults to False.
+
+    Inputs:
+        inputs (Tensor): A (possibly nested structure of) tensor[s]. The input 
+            sequences. 
+            If time major is True, the shape is `[batch_size, time_steps, input_size]`
+            If time major is False, the shape is [time_steps, batch_size, input_size]`
+            where `input_size` is the input size of the cell.
+        initial_states (Tensor|list|tuple, optional): Tensor of a possibly 
+            nested structure of tensors, representing the initial state for 
+            the rnn cell. If not provided, `cell.get_initial_states` would be 
+            called to produce the initial states. Defaults to None.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as 
+            padded sequences. In each input sequence, elements whose time step 
+            index are not less than the valid length are treated as paddings.
+        **kwargs: Additional keyword arguments to pass to `forward` of the cell. 
+
+    Returns:
+        (outputs, final_states)
+        outputs (Tensor|list|tuple): the output sequences.
+            If `time_major` is True, the shape is 
+            `[time_steps, batch_size, hidden_size]`, else 
+            `[batch_size, time_steps, hidden_size]`.
+        final_states (Tensor|list|tuple): final states of the cell. Tensor or 
+            a possibly nested structure of tensors which has the same structure 
+            with intial state. Each tensor in final states has the same shape 
+            and dtype as the corresponding tensor in initial states.
+    
+    Notes:
+        This class is a low level API for wrapping rnn cell into a RNN network.
+        Users should take care of the state of the cell. If `initial_states` is 
+        passed to the `forward` method, make sure that it satisfies the 
+        requirements of the cell.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            inputs = paddle.rand((4, 23, 16))
+            prev_h = paddle.randn((4, 32))
+
+            cell = paddle.nn.SimpleRNNCell(16, 32)
+            rnn = paddle.nn.RNN(cell)
+            outputs, final_states = rnn(inputs, prev_h)
+
+    """
+
+    def __init__(self, cell, is_reverse=False, time_major=False):
+        super(RNN, self).__init__()
+        self.cell = cell
+        if not hasattr(self.cell, "call"):
+            # Non-dygraph mode: the `rnn` api uses `cell.call`, so alias it to `forward`.
+            self.cell.call = self.cell.forward
+        self.is_reverse = is_reverse
+        self.time_major = time_major
+
+    def forward(self, inputs, initial_states=None, sequence_length=None,
+                **kwargs):
+        """Run the wrapped cell over `inputs` with `F.rnn` and return
+        `(final_outputs, final_states)`; `kwargs` are forwarded to `F.rnn`."""
+        if initial_states is None:
+            # RNN never sets `batch_index`; derive the batch axis from `time_major`.
+            initial_states = self.cell.get_initial_states(
+                batch_ref=inputs,
+                dtype=inputs.dtype,
+                batch_dim_idx=1 if self.time_major else 0)
+
+        final_outputs, final_states = F.rnn(self.cell,
+                                            inputs,
+                                            initial_states=initial_states,
+                                            sequence_length=sequence_length,
+                                            time_major=self.time_major,
+                                            is_reverse=self.is_reverse,
+                                            **kwargs)
+        return final_outputs, final_states
+
+
+class BiRNN(Layer):
+    r"""
+    Wrapper for bidirectional RNN, which builds a bidirectional RNN given the
+    forward rnn cell and backward rnn cell. A BiRNN applies forward RNN and
+    backward RNN with corresponding cells separately and concats the outputs
+    along the last axis.
+
+    Arguments:
+        cell_fw (RNNCellBase): A RNNCellBase instance used for forward RNN.
+        cell_bw (RNNCellBase): A RNNCellBase instance used for backward RNN.
+            Its `input_size` must equal that of `cell_fw` (else ValueError).
+        time_major (bool): Whether the first dimension of the input means the
+            time steps. Defaults to False.
+
+    Inputs:
+        inputs (Tensor): the input sequences of both RNN.
+            If `time_major` is True, the shape is
+            `[time_steps, batch_size, input_size]`, else the shape is
+            `[batch_size, time_steps, input_size]`, where input_size is the
+            input size of both cells.
+        initial_states (list|tuple, optional): A tuple/list of the initial
+            states of the forward cell and backward cell. If not provided,
+            `cell.get_initial_states` would be called to produce the initial
+            states for each cell. Defaults to None.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as
+            padded sequences. In each input sequence, elements whose time step
+            index are not less than the valid length are treated as paddings.
+        **kwargs: Additional keyword arguments. Arguments passed to `forward`
+            for each cell.
+
+    Outputs:
+        (outputs, final_states)
+        outputs (Tensor): the outputs of the bidirectional RNN, i.e. the forward
+            and backward RNN outputs concatenated along the last axis.
+            If `time_major` is True, the shape is `[time_steps, batch_size, size]`,
+            else the shape is `[batch_size, time_steps, size]`, where size is
+            `cell_fw.hidden_size + cell_bw.hidden_size`.
+        final_states (tuple): A tuple of the final states of the forward
+            cell and backward cell.
+
+    Notes:
+        This class is a low level API for wrapping rnn cells into a BiRNN
+        network. Users should take care of the states of the cells.
+        If `initial_states` is passed to the `forward` method, make sure that
+        it satisfies the requirements of the cells.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            cell_fw = paddle.nn.LSTMCell(16, 32)
+            cell_bw = paddle.nn.LSTMCell(16, 32)
+            rnn = paddle.nn.BiRNN(cell_fw, cell_bw)
+
+            inputs = paddle.rand((2, 23, 16))
+            outputs, final_states = rnn(inputs)
+
+    """
+
+    def __init__(self, cell_fw, cell_bw, time_major=False):
+        super(BiRNN, self).__init__()
+        self.cell_fw = cell_fw
+        self.cell_bw = cell_bw
+        if cell_fw.input_size != cell_bw.input_size:
+            raise ValueError("input size of forward cell({}) does not equal "
+                             "that of backward cell({})".format(
+                                 cell_fw.input_size, cell_bw.input_size))
+        for cell in [self.cell_fw, self.cell_bw]:
+            if not hasattr(cell, "call"):
+                # for non-dygraph mode, `rnn` api uses cell.call
+                cell.call = cell.forward
+        self.time_major = time_major
+
+    def forward(self,
+                inputs,
+                initial_states=None,
+                sequence_length=None,
+                **kwargs):
+        if isinstance(initial_states, (list, tuple)):
+            assert len(initial_states) == 2, \
+                "length of initial_states should be 2 when it is a list/tuple"
+        else:  # a single value (e.g. None) is handed to both directions
+            initial_states = [initial_states, initial_states]
+
+        outputs, final_states = F.birnn(self.cell_fw, self.cell_bw, inputs,
+                                        initial_states, sequence_length,
+                                        self.time_major, **kwargs)
+        return outputs, final_states
+
+
+class RNNMixin(LayerList):
+    r"""
+    A Mixin class providing the shared `forward` of SimpleRNN, LSTM and GRU: it
+    runs the stacked layers in order, with dropout on the input of layers > 0.
+    """
+
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        batch_index = 1 if self.time_major else 0  # batch axis of `inputs`
+        dtype = inputs.dtype
+        if initial_states is None:
+            state_shape = (self.num_layers * self.num_directions, -1,
+                           self.hidden_size)
+            if self.state_components == 1:
+                initial_states = paddle.fluid.layers.fill_constant_batch_size_like(
+                    inputs, state_shape, dtype, 0, batch_index, 1)
+            else:
+                initial_states = tuple([
+                    paddle.fluid.layers.fill_constant_batch_size_like(
+                        inputs, state_shape, dtype, 0, batch_index, 1)
+                    for _ in range(self.state_components)
+                ])
+
+        states = split_states(initial_states, self.num_directions == 2,
+                              self.state_components)
+        final_states = []
+
+        for i, rnn_layer in enumerate(self):
+            if i > 0:  # the first layer's input is never dropped out
+                inputs = F.dropout(
+                    inputs,
+                    self.dropout,
+                    training=self.training,
+                    mode="upscale_in_train")
+            outputs, final_state = rnn_layer(inputs, states[i], sequence_length)
+            final_states.append(final_state)
+            inputs = outputs  # this layer's outputs feed the next layer
+
+        final_states = concat_states(final_states, self.num_directions == 2,
+                                     self.state_components)
+        return outputs, final_states
+
+
+class SimpleRNN(RNNMixin):
+    r"""
+    Multilayer Elman network(SimpleRNN). It takes input sequences and initial
+    states as inputs, and returns the output sequences and the final states.
+
+    Each layer inside the SimpleRNN maps the input sequences and initial states
+    to the output sequences and final states in the following manner: at each
+    step, it takes step inputs(:math:`x_{t}`) and previous
+    states(:math:`h_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`)
+    and new states(:math:`h_{t}`).
+
+    .. math::
+
+        h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh}) \\
+        y_{t} & = h_{t}
+
+    where :math:`\mathrm{tanh}` is the default nonlinearity; it can be switched
+    to `relu` through the `activation` argument.
+
+    Arguments:
+        input_size (int): The input size for the first layer's cell.
+        hidden_size (int): The hidden size for each layer's cell.
+        num_layers (int, optional): Number of layers. Defaults to 1.
+        activation (str, optional): The activation in each SimpleRNN cell. It can be
+            `tanh` or `relu`. Defaults to `tanh`.
+        direction (str, optional): The direction of the network. It can be "forward",
+            "backward" or "bidirectional". Defaults to "forward".
+        dropout (float, optional): The dropout probability. Dropout is applied to the
+            input of each layer except for the first layer. Defaults to 0.
+        time_major (bool, optional): Whether the first dimension of the input means the
+            time steps. Defaults to False.
+        weight_ih_attr (ParamAttr, optional): The parameter attribute for
+            `weight_ih` of each cell. Defaults to None.
+        weight_hh_attr (ParamAttr, optional): The parameter attribute for
+            `weight_hh` of each cell. Defaults to None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_ih` of each cell. Defaults to None.
+        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_hh` of each cell. Defaults to None.
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Inputs:
+        inputs (Tensor): the input sequence.
+            If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`,
+            else, the shape is `[batch_size, time_steps, input_size]`.
+        initial_states (Tensor, optional): the initial state. The shape is
+            `[num_layers * num_directions, batch_size, hidden_size]`.
+            If `initial_states` is not given, zero initial states are used.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as
+            padded sequences. In each input sequence, elements whose time step
+            index are not less than the valid length are treated as paddings.
+
+    Returns:
+        (outputs, final_states)
+        outputs (Tensor): the output sequence.
+            If `time_major` is True, the shape is
+            `[time_steps, batch_size, num_directions * hidden_size]`,
+            else, the shape is
+            `[batch_size, time_steps, num_directions * hidden_size]`.
+            Note that `num_directions` is 2 if direction is "bidirectional"
+            else 1.
+        final_states (Tensor): final states. The shape is
+            `[num_layers * num_directions, batch_size, hidden_size]`.
+            Note that `num_directions` is 2 if direction is "bidirectional"
+            else 1.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            rnn = paddle.nn.SimpleRNN(16, 32, 2)
+
+            x = paddle.randn((4, 23, 16))
+            prev_h = paddle.randn((2, 4, 32))
+            y, h = rnn(x, prev_h)
+
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 num_layers=1,
+                 activation="tanh",
+                 direction="forward",
+                 dropout=0.,
+                 time_major=False,
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super(SimpleRNN, self).__init__()
+
+        if direction in ["forward", "backward"]:
+            is_reverse = direction == "backward"
+            cell = SimpleRNNCell(input_size, hidden_size, activation,
+                                 weight_ih_attr, weight_hh_attr, bias_ih_attr,
+                                 bias_hh_attr)
+            self.append(RNN(cell, is_reverse, time_major))
+            for i in range(1, num_layers):  # layers after the first take hidden_size inputs
+                cell = SimpleRNNCell(hidden_size, hidden_size, activation,
+                                     weight_ih_attr, weight_hh_attr,
+                                     bias_ih_attr, bias_hh_attr)
+                self.append(RNN(cell, is_reverse, time_major))
+        elif direction == "bidirectional":
+            cell_fw = SimpleRNNCell(input_size, hidden_size, activation,
+                                    weight_ih_attr, weight_hh_attr,
+                                    bias_ih_attr, bias_hh_attr)
+            cell_bw = SimpleRNNCell(input_size, hidden_size, activation,
+                                    weight_ih_attr, weight_hh_attr,
+                                    bias_ih_attr, bias_hh_attr)
+            self.append(BiRNN(cell_fw, cell_bw, time_major))
+            for i in range(1, num_layers):  # inputs are the concatenated fw/bw outputs
+                cell_fw = SimpleRNNCell(
+                    2 * hidden_size, hidden_size, activation, weight_ih_attr,
+                    weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                cell_bw = SimpleRNNCell(
+                    2 * hidden_size, hidden_size, activation, weight_ih_attr,
+                    weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                self.append(BiRNN(cell_fw, cell_bw, time_major))
+        else:
+            raise ValueError(
+                "direction should be forward, backward or bidirectional, "
+                "received direction = {}".format(direction))
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.time_major = time_major
+        self.num_layers = num_layers
+        self.state_components = 1  # the state is a single tensor (h)
+
+
+class LSTM(RNNMixin):
+    r"""
+    Multilayer LSTM. It takes a sequence and an initial state as inputs, and
+    returns the output sequences and the final states.
+
+    Each layer inside the LSTM maps the input sequences and initial states
+    to the output sequences and final states in the following manner: at each
+    step, it takes step inputs(:math:`x_{t}`) and previous
+    states(:math:`h_{t-1}, c_{t-1}`) as inputs, and returns step
+    outputs(:math:`y_{t}`) and new states(:math:`h_{t}, c_{t}`).
+
+    .. math::
+
+        i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi}) \\
+        f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf}) \\
+        o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho}) \\
+        \widetilde{c}_{t} & = \tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg}) \\
+        c_{t} & = f_{t} * c_{t-1} + i_{t} * \widetilde{c}_{t} \\
+        h_{t} & = o_{t} * \tanh(c_{t}) \\
+        y_{t} & = h_{t}
+
+    where :math:`\sigma` is the sigmoid function, and :math:`*` is the elementwise
+    multiplication operator.
+
+    Arguments:
+        input_size (int): The input size for the first layer's cell.
+        hidden_size (int): The hidden size for each layer's cell.
+        num_layers (int, optional): Number of layers. Defaults to 1.
+        direction (str, optional): The direction of the network. It can be
+            "forward", "backward" or "bidirectional". Defaults to "forward".
+        dropout (float, optional): The dropout probability. Dropout is applied
+            to the input of each layer except for the first layer. Defaults to 0.
+        time_major (bool, optional): Whether the first dimension of the input
+            means the time steps. Defaults to False.
+        weight_ih_attr (ParamAttr, optional): The parameter attribute for
+            `weight_ih` of each cell. Default: None.
+        weight_hh_attr (ParamAttr, optional): The parameter attribute for
+            `weight_hh` of each cell. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_ih` of each cell. Default: None.
+        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_hh` of each cell. Default: None.
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Inputs:
+        inputs (Tensor): the input sequence.
+            If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`,
+            else, the shape is `[batch_size, time_steps, input_size]`.
+        initial_states (tuple, optional): the initial state, a tuple of (h, c),
+            the shape of each is `[num_layers * num_directions, batch_size, hidden_size]`.
+            If `initial_states` is not given, zero initial states are used.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as
+            padded sequences. In each input sequence, elements whose time step
+            index are not less than the valid length are treated as paddings.
+
+    Returns:
+        (outputs, final_states)
+        outputs (Tensor): the output sequence.
+            If `time_major` is True, the shape is
+            `[time_steps, batch_size, num_directions * hidden_size]`,
+            If `time_major` is False, the shape is
+            `[batch_size, time_steps, num_directions * hidden_size]`.
+            Note that `num_directions` is 2 if direction is "bidirectional"
+            else 1.
+        final_states (tuple): the final state, a tuple of two tensors, h and c.
+            The shape of each is
+            `[num_layers * num_directions, batch_size, hidden_size]`.
+            Note that `num_directions` is 2 if direction is "bidirectional"
+            else 1.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            rnn = paddle.nn.LSTM(16, 32, 2)
+
+            x = paddle.randn((4, 23, 16))
+            prev_h = paddle.randn((2, 4, 32))
+            prev_c = paddle.randn((2, 4, 32))
+            y, (h, c) = rnn(x, (prev_h, prev_c))
+
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 num_layers=1,
+                 direction="forward",
+                 dropout=0.,
+                 time_major=False,
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super(LSTM, self).__init__()
+
+        if direction in ["forward", "backward"]:
+            is_reverse = direction == "backward"
+            cell = LSTMCell(input_size, hidden_size, weight_ih_attr,
+                            weight_hh_attr, bias_ih_attr, bias_hh_attr)
+            self.append(RNN(cell, is_reverse, time_major))
+            for i in range(1, num_layers):  # layers after the first take hidden_size inputs
+                cell = LSTMCell(hidden_size, hidden_size, weight_ih_attr,
+                                weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                self.append(RNN(cell, is_reverse, time_major))
+        elif direction == "bidirectional":
+            cell_fw = LSTMCell(input_size, hidden_size, weight_ih_attr,
+                               weight_hh_attr, bias_ih_attr, bias_hh_attr)
+            cell_bw = LSTMCell(input_size, hidden_size, weight_ih_attr,
+                               weight_hh_attr, bias_ih_attr, bias_hh_attr)
+            self.append(BiRNN(cell_fw, cell_bw, time_major))
+            for i in range(1, num_layers):  # inputs are the concatenated fw/bw outputs
+                cell_fw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr,
+                                   weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                cell_bw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr,
+                                   weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                self.append(BiRNN(cell_fw, cell_bw, time_major))
+        else:
+            raise ValueError(
+                "direction should be forward, backward or bidirectional, "
+                "received direction = {}".format(direction))
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.time_major = time_major
+        self.num_layers = num_layers
+        self.state_components = 2  # the state is the pair (h, c)
+
+
+class GRU(RNNMixin):
+    r"""
+    Multilayer GRU. It takes input sequences and initial states as inputs, and
+    returns the output sequences and the final states.
+
+    Each layer inside the GRU maps the input sequences and initial states
+    to the output sequences and final states in the following manner: at each
+    step, it takes step inputs(:math:`x_{t}`) and previous
+    states(:math:`h_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`)
+    and new states(:math:`h_{t}`).
+
+    .. math::
+
+        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr}) \\
+        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz}) \\
+        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc})) \\
+        h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t} \\
+        y_{t} & = h_{t}
+
+    where :math:`\sigma` is the sigmoid function, and :math:`*` is the elementwise
+    multiplication operator.
+
+    Arguments:
+        input_size (int): The input size for the first layer's cell.
+        hidden_size (int): The hidden size for each layer's cell.
+        num_layers (int, optional): Number of layers. Defaults to 1.
+        direction (str, optional): The direction of the network. It can be
+            "forward", "backward" or "bidirectional". Defaults to "forward".
+        dropout (float, optional): The dropout probability. Dropout is applied
+            to the input of each layer except for the first layer. Defaults to 0.
+        time_major (bool, optional): Whether the first dimension of the input
+            means the time steps. Defaults to False.
+        weight_ih_attr (ParamAttr, optional): The parameter attribute for
+            `weight_ih` of each cell. Default: None.
+        weight_hh_attr (ParamAttr, optional): The parameter attribute for
+            `weight_hh` of each cell. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_ih` of each cell. Default: None.
+        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
+            `bias_hh` of each cell. Default: None.
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Inputs:
+        inputs (Tensor): the input sequence.
+            If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`,
+            else, the shape is `[batch_size, time_steps, input_size]`.
+        initial_states (Tensor, optional): the initial state. The shape is
+            `[num_layers * num_directions, batch_size, hidden_size]`.
+            If `initial_states` is not given, zero initial states are used.
+            Defaults to None.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as
+            padded sequences. In each input sequence, elements whose time step
+            index are not less than the valid length are treated as paddings.
+
+    Returns:
+        (outputs, final_states)
+        outputs (Tensor): the output sequence.
+            If `time_major` is True, the shape is
+            `[time_steps, batch_size, num_directions * hidden_size]`,
+            else, the shape is
+            `[batch_size, time_steps, num_directions * hidden_size]`.
+            Note that `num_directions` is 2 if direction is "bidirectional"
+            else 1.
+        final_states (Tensor): final states. The shape is
+            `[num_layers * num_directions, batch_size, hidden_size]`.
+            Note that `num_directions` is 2 if direction is "bidirectional"
+            else 1.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            rnn = paddle.nn.GRU(16, 32, 2)
+
+            x = paddle.randn((4, 23, 16))
+            prev_h = paddle.randn((2, 4, 32))
+            y, h = rnn(x, prev_h)
+
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 num_layers=1,
+                 direction="forward",
+                 dropout=0.,
+                 time_major=False,
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super(GRU, self).__init__()
+
+        if direction in ["forward", "backward"]:
+            is_reverse = direction == "backward"
+            cell = GRUCell(input_size, hidden_size, weight_ih_attr,
+                           weight_hh_attr, bias_ih_attr, bias_hh_attr)
+            self.append(RNN(cell, is_reverse, time_major))
+            for i in range(1, num_layers):  # layers after the first take hidden_size inputs
+                cell = GRUCell(hidden_size, hidden_size, weight_ih_attr,
+                               weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                self.append(RNN(cell, is_reverse, time_major))
+        elif direction == "bidirectional":
+            cell_fw = GRUCell(input_size, hidden_size, weight_ih_attr,
+                              weight_hh_attr, bias_ih_attr, bias_hh_attr)
+            cell_bw = GRUCell(input_size, hidden_size, weight_ih_attr,
+                              weight_hh_attr, bias_ih_attr, bias_hh_attr)
+            self.append(BiRNN(cell_fw, cell_bw, time_major))
+            for i in range(1, num_layers):  # inputs are the concatenated fw/bw outputs
+                cell_fw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr,
+                                  weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                cell_bw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr,
+                                  weight_hh_attr, bias_ih_attr, bias_hh_attr)
+                self.append(BiRNN(cell_fw, cell_bw, time_major))
+        else:
+            raise ValueError(
+                "direction should be forward, backward or bidirectional, "
+                "received direction = {}".format(direction))
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.num_directions = 2 if direction == "bidirectional" else 1
+        self.time_major = time_major
+        self.num_layers = num_layers
+        self.state_components = 1  # the state is a single tensor (h)
diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py
index 49314c9832dd389411dffb3f498b34d09337a3f0..095a34cb6fc68cda6900790141d226208b203f82 100644
--- a/python/paddle/optimizer/__init__.py
+++ b/python/paddle/optimizer/__init__.py
@@ -26,9 +26,8 @@ __all__ = [
 ]
 
 
-from ..fluid.optimizer import  SGD, Momentum, Adagrad, Dpsgd, DecayedAdagrad, \
-            Ftrl, Adadelta, \
-            SGDOptimizer, MomentumOptimizer, AdagradOptimizer,DpsgdOptimizer,\
+from ..fluid.optimizer import Momentum, Adagrad, Dpsgd, DecayedAdagrad, Ftrl,\
+            AdagradOptimizer,DpsgdOptimizer,\
             DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \
             ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\
             ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \
@@ -39,6 +38,9 @@ from .adam import Adam
 from .adamw import AdamW
 from .adamax import Adamax
 from .rmsprop import RMSProp
+from .adadelta import Adadelta
+from .sgd import SGD
+from .momentum import Momentum
 
 from . import lr_scheduler
 from .lr_scheduler import _LRScheduler, NoamLR, PiecewiseLR, NaturalExpLR, InverseTimeLR, PolynomialLR, \
diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py
new file mode 100644
index 0000000000000000000000000000000000000000..bba2c11ea07490804573189bac8b315dfc80fd37
--- /dev/null
+++ b/python/paddle/optimizer/adadelta.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable, name_scope
+
+__all__ = ["Adadelta"]
+
+
+class Adadelta(Optimizer):
+    """
+    **Notes: This API does not support sparse parameter optimization.**
+
+    Adadelta Optimizer. Please refer to this for details:
+    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD <https://arxiv.org/abs/1212.5701>`_.
+
+    The update is done as follows:
+
+    .. math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2
+
+        learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) }
+
+        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2
+
+    Args:
+        learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
+            It can be a float value or any subclass of ``_LRScheduler`` . The default value is 0.001.
+        epsilon (float): a small float number for numeric stability. Default 1.0e-6.
+        rho (float): a floating point value indicating the decay rate. Default 0.95.
+        parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+            It can be a float value as coeff of L2 regularization or \
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            the regularization setting here in optimizer will be ignored for this parameter. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
+            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three clipping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): The default value is None. Normally there is no need for user
+                to set this property. For more information, please refer to
+                :ref:`api_guide_Name` .
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+            beta1 = paddle.to_tensor([0.9], dtype="float32")
+            beta2 = paddle.to_tensor([0.99], dtype="float32")
+            adadelta = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
+            back = out.backward()
+            adadelta.step()
+            adadelta.clear_grad()
+
+    """
+
+    _avg_squared_grad_acc_str = "_avg_squared_grad"
+    _avg_squared_update_acc_str = "_avg_squared_update"
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 epsilon=1.0e-6,
+                 rho=0.95,
+                 parameters=None,
+                 weight_decay=None,
+                 grad_clip=None,
+                 name=None):
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+        if epsilon is None:
+            raise ValueError("epsilon is not set.")
+        if rho is None:
+            raise ValueError("rho is not set.")
+        super(Adadelta, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name)
+        self.type = "adadelta"
+        self._epsilon = epsilon
+        self._rho = rho
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._avg_squared_grad_acc_str, p)
+            self._add_accumulator(self._avg_squared_update_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        avg_squared_grad_acc = self._get_accumulator(
+            self._avg_squared_grad_acc_str, param_and_grad[0])
+        avg_squared_update_acc = self._get_accumulator(
+            self._avg_squared_update_acc_str, param_and_grad[0])
+
+        # Create the adadelta optimizer op
+        adadelta_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "AvgSquaredGrad": avg_squared_grad_acc,
+                "AvgSquaredUpdate": avg_squared_update_acc
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "AvgSquaredGradOut": avg_squared_grad_acc,
+                "AvgSquaredUpdateOut": avg_squared_update_acc
+            },
+            attrs={"epsilon": self._epsilon,
+                   "rho": self._rho},
+            stop_gradient=True)
+
+        return adadelta_op
diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py
new file mode 100644
index 0000000000000000000000000000000000000000..87fa86c17615ef8cc455e95517608a246d677e74
--- /dev/null
+++ b/python/paddle/optimizer/momentum.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable, name_scope
+
+__all__ = ["Momentum"]
+
+
+class Momentum(Optimizer):
+    """
+
+    Simple Momentum optimizer with velocity state
+
+    This optimizer has a flag for Nesterov Momentum.
+
+    The update equations are as follows:
+
+    .. math::
+
+        & velocity = mu * velocity + gradient
+
+        & if (use\_nesterov):
+
+        &\quad   param = param - (gradient + mu * velocity) * learning\_rate
+
+        & else:
+
+        &\quad   param = param - learning\_rate * velocity
+
+    Parameters:
+
+        learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
+            It can be a float value or any subclass of ``_LRScheduler`` . The default value is 0.001.
+        momentum (float): Momentum factor. The default value is 0.9.
+        parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+            It can be a float value as coeff of L2 regularization or \
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            the regularization setting here in optimizer will be ignored for this parameter. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
+            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three clipping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): The default value is None. Normally there is no need for user
+                to set this property. For more information, please refer to
+                :ref:`api_guide_Name` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+            beta1 = paddle.to_tensor([0.9], dtype="float32")
+            beta2 = paddle.to_tensor([0.99], dtype="float32")
+            momentum = paddle.optimizer.Momentum(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
+            back = out.backward()
+            momentum.step()
+            momentum.clear_grad()
+    """
+    _velocity_acc_str = "velocity"
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 momentum=0.9,
+                 parameters=None,
+                 use_nesterov=False,
+                 weight_decay=None,
+                 grad_clip=None,
+                 name=None):
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set")
+        if momentum is None:
+            raise ValueError("momentum is not set")
+        super(Momentum, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name)
+        self.type = "momentum"
+        self._momentum = momentum
+        self._use_nesterov = bool(use_nesterov)
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        for p in parameters:
+            self._add_accumulator(self._velocity_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        velocity_acc = self._get_accumulator(self._velocity_acc_str,
+                                             param_and_grad[0])
+        lr = self._create_param_lr(param_and_grad)
+
+        if framework.in_dygraph_mode():
+            _, _ = core.ops.momentum(param_and_grad[0], param_and_grad[1],
+                                     velocity_acc, lr, param_and_grad[0],
+                                     velocity_acc, 'mu', self._momentum,
+                                     'use_nesterov', self._use_nesterov)
+            return None
+
+        attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov}
+        inputs = {
+            "Param": [param_and_grad[0]],
+            "Grad": [param_and_grad[1]],
+            "Velocity": [velocity_acc],
+            "LearningRate": [lr]
+        }
+
+        outputs = {
+            "ParamOut": [param_and_grad[0]],
+            "VelocityOut": [velocity_acc]
+        }
+        # create the momentum optimize op
+        momentum_op = block.append_op(
+            type=self.type,
+            inputs=inputs,
+            outputs=outputs,
+            attrs=attrs,
+            stop_gradient=True)
+
+        return momentum_op
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index cb602ff0b3754f82fb2ef9d8b78de18e46d56778..3f9de0cefc05d1aaee36fa3af5cfa9ae4affcb97 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -35,12 +35,12 @@ from ..fluid.layers import ops
 from ..fluid.regularizer import append_regularization_ops
 from ..fluid.dygraph import base as imperative_base
 from ..fluid.dygraph import no_grad
-from ..fluid.dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay
 from paddle.fluid import core
 from paddle.fluid.layers import tensor
 from functools import reduce
 from ..fluid.wrapped_decorator import signature_safe_contextmanager
 from .. import compat as cpt
+from .lr_scheduler import _LRScheduler
 
 __all__ = ['Optimizer']
 
@@ -53,8 +53,8 @@ class Optimizer(object):
     but need to use one of it's implementation.
 
     Args:
-        learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``.
-            It can be a float value or a LearningRateDecay.
+        learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``.
+            It can be a float value or any subclass of ``_LRScheduler`` .
         parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
@@ -109,11 +109,6 @@ class Optimizer(object):
             parameters) if parameters is not None else None
         self._name = name
         if framework.in_dygraph_mode():
-            if not isinstance(learning_rate, float) and \
-                    not isinstance(learning_rate, LearningRateDecay):
-                raise TypeError(
-                    "learning rate should be float or LearningRateDecay, got %s here"
-                    % type(learning_rate))
             if self._parameter_list is None:
                 raise AttributeError(
                     "parameters argument given to the Optimizer should not be None in dygraph mode."
@@ -126,13 +121,10 @@ class Optimizer(object):
                             "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
                             % weight_decay.__str__())
                         break
-        else:
-            if not isinstance(learning_rate, float) and \
-                    not isinstance(learning_rate, framework.Variable):
-                raise TypeError(
-                    "learning rate should be float or Tensor, got %s here" %
-                    type(learning_rate))
-
+        if not isinstance(learning_rate, (float, _LRScheduler)):
+            raise TypeError(
+                "learning rate should be float or _LRScheduler, got %s here" %
+                type(learning_rate))
         if grad_clip is not None:
             if not isinstance(grad_clip, GradientClipBase):
                 raise TypeError(
@@ -150,9 +142,6 @@ class Optimizer(object):
         # each program should have a independent learning rate
         # program -> tensor(learning_rate)
         self._learning_rate_map = dict()
-        if isinstance(self._learning_rate, framework.Variable):
-            self._learning_rate_map[framework.default_main_program(
-            )] = self._learning_rate
         # Dictionary of accumulators. Some optimizer subclasses need to
         # allocate and manage extra tensors associated with the parameters
         # to train. These tensors are called accumulators.
@@ -167,7 +156,7 @@ class Optimizer(object):
     @framework.dygraph_only
     def state_dict(self):
         '''
-        Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict.
+        Get state dict information from optimizer. It contains all the tensors used by the optimizer. For Adam optimizer, it contains beta1, beta2, momentum etc. If an _LRScheduler has been used, its state is included in the state dict under the ``LR_Scheduler`` key.
         If the optimizer never be called(minimize function), the state_dict is empty.
 
         Args: 
@@ -192,24 +181,14 @@ class Optimizer(object):
             for para_name, var_tmp in v.items():
                 state_dict[var_tmp.name] = var_tmp
         # global step if use lr decay
-        if isinstance(self._learning_rate, LearningRateDecay):
+        if isinstance(self._learning_rate, _LRScheduler):
             state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
-
-            if not isinstance(self._learning_rate, _LearningRateEpochDecay):
-                var_tmp = None
-                var_temp = framework._varbase_creator(
-                    None, name='global_step', dtype='int32')
-
-                tensor.fill_constant(
-                    [1], "int32", self._learning_rate.step_num, out=var_temp)
-
-                state_dict['global_step'] = var_temp
         return state_dict
 
     @framework.dygraph_only
     def set_state_dict(self, state_dict):
         '''
-        Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed.
+        Load optimizer state dict. For Adam optimizer, it contains beta1, beta2, momentum etc. If an _LRScheduler has been used, its state is restored from the ``LR_Scheduler`` entry of the state dict.
 
         Args: 
             state_dict(dict) : Dict contains all the Tensor needed by optimizer
@@ -226,7 +205,7 @@ class Optimizer(object):
                 state_dict = emb.state_dict()
                 paddle.framework.save(state_dict, "paddle_dy")
 
-                adam = paddle.optimizer.Adam(learning_rate=paddle.nn.functional.noam_decay( 100, 10000), 
+                adam = paddle.optimizer.Adam(learning_rate=paddle.optimizer.NoamLR( 100, 10000), 
                                             parameters=emb.parameters())
                 state_dict = adam.state_dict()
                 paddle.framework.save(state_dict, "paddle_dy")
@@ -237,29 +216,8 @@ class Optimizer(object):
 
         '''
 
-        if isinstance(self._learning_rate, LearningRateDecay):
-            self._learning_rate.set_dict(state_dict["LR_Scheduler"])
-
-            if not isinstance(self._learning_rate, _LearningRateEpochDecay):
-                assert 'global_step' in state_dict, \
-                        'Global step not in state dict, Dygraph use LearningRateDecay, global_step must in state_dict'
-                global_step = state_dict['global_step']
-
-                if isinstance(global_step, Variable):
-                    step_np = global_step
-                    step_np = np.array(step_np.value().get_tensor())
-                    assert step_np.shape == (1,),  \
-                            "global step shape is (1,), the shape is {}".format( step_np.shape )
-
-                    self._learning_rate.step_num = int(step_np[0])
-                elif isinstance(global_step, np.ndarray):
-                    assert global_step.shape == (1,),  \
-                            "global step shape is (1,), the shape is {}".format( global_step.shape )
-                    self._learning_rate.step_num = global_step[0]
-                else:
-                    raise RuntimeError(
-                        "Type not supprt, value in state dict must be [VarBase, Tensor, numpy], the type is ",
-                        type(global_step))
+        if isinstance(self._learning_rate, _LRScheduler):
+            self._learning_rate.set_state_dict(state_dict["LR_Scheduler"])
 
         self._accumulators_holder = state_dict
         for k, v in self._accumulators.items():
@@ -296,58 +254,49 @@ class Optimizer(object):
         return self._opti_name_list
 
     def _create_global_learning_rate(self):
-        if imperative_base.enabled():
-            # create learning rate tensor
-            if isinstance(self._learning_rate, float):
-                lr = self._global_learning_rate()
-
-                if isinstance(lr, framework.Variable):
-                    return
-                else:
-                    self._learning_rate_map[framework.default_main_program(
-                    )] = layers.create_global_var(
-                        name=unique_name.generate("learning_rate"),
-                        shape=[1],
-                        value=float(self._learning_rate),
-                        dtype=paddle.get_default_dtype()
-                        if self._dtype is None else self._dtype,
-                        persistable=True)
-            # get learning rate Tensor from LearningRateDecay
-            elif isinstance(self._learning_rate, LearningRateDecay):
+        if isinstance(self._learning_rate, _LRScheduler):
+            lr_var = self._global_learning_rate()
+            # only create global lr_var once
+            if not isinstance(lr_var, framework.Variable):
+                lr_name = unique_name.generate('learning_rate')
+                self._learning_rate._var_name = lr_name
+                lr_var = self.helper.create_global_variable(
+                    name=lr_name,
+                    shape=[1],
+                    persistable=True,
+                    stop_gradient=True,
+                    dtype=paddle.get_default_dtype()
+                    if self._dtype is None else self._dtype)
+                main_prog = framework.default_main_program()
+                main_prog.lr_sheduler = self._learning_rate
+                main_prog.lr_var = lr_var
                 self._learning_rate_map[framework.default_main_program(
-                )] = self._learning_rate()
-            else:
-                raise TypeError(
-                    "optimizer's learning rate must be float or LearningRateDecay"
-                )
-        else:
-            lr = self._global_learning_rate()
+                )] = lr_var
 
+            lr_value = float(self._learning_rate())
+            self.helper.set_variable_initializer(
+                lr_var, initializer=Constant(value=lr_value))
+        elif isinstance(self._learning_rate, float):
+            # only create global lr_var once
+            lr = self._global_learning_rate()
             if isinstance(lr, framework.Variable):
                 return
             else:
-                if not isinstance(self._learning_rate, float):
-                    raise TypeError(
-                        "learning rate Tensor is create outside optimizer,"
-                        "can not create new learning rate Tensor for new program"
-                    )
-
-            # create learning rate in the current main program
-            self._learning_rate_map[framework.default_main_program(
-            )] = layers.create_global_var(
-                name=unique_name.generate("learning_rate"),
-                shape=[1],
-                value=float(self._learning_rate),
-                dtype=paddle.get_default_dtype()
-                if self._dtype is None else self._dtype,
-                persistable=True)
+                self._learning_rate_map[framework.default_main_program(
+                )] = layers.create_global_var(
+                    name=unique_name.generate("learning_rate"),
+                    shape=[1],
+                    value=float(self._learning_rate),
+                    dtype=paddle.get_default_dtype()
+                    if self._dtype is None else self._dtype,
+                    persistable=True)
 
     @framework.dygraph_only
     def set_lr(self, value):
         """
         :api_attr: imperative
         
-        Set the value of the learning rate manually in the optimizer. If the optimizer use LearningRateDecay,
+        Set the value of the learning rate manually in the optimizer. If the optimizer uses an _LRScheduler,
         this API cannot be invoked, because it will lead to conflict.
 
         Args:
@@ -378,53 +327,36 @@ class Optimizer(object):
                 #    current lr is 0.5
                 #    current lr is 0.6
 
-
-                    # set learning rate manually by framework Tensor
-                    lr_var = paddle.create_global_var(
-                        shape=[1], value=0.7, dtype='float32')
-                    adam.set_lr(lr_var)
-                    lr = adam.get_lr()
-                    print("current lr is {}".format(lr))
-                    # Print:
-                    #    current lr is 0.7
-
-
-
         """
-        if not isinstance(value, (framework.Variable, float)):
+        if not isinstance(value, (int, float)):
             raise TypeError(
-                "The type of 'value' in optimizer.set_lr must be (float, Tensor), but received %s."
+                "The type of 'value' in optimizer.set_lr must be float, but received %s."
                 % (type(value)))
-        if isinstance(self._learning_rate, LearningRateDecay):
+        if isinstance(self._learning_rate, _LRScheduler):
             raise RuntimeError(
-                "optimizer's learning rate can't be LearningRateDecay when invoke this API, because this will lead to conflict."
+                "optimizer's learning rate can't be _LRScheduler when invoke this API, because this will lead to conflict."
             )
-        if isinstance(value, float):
-            self._learning_rate = value
-            current_lr = self._global_learning_rate()
-            if current_lr is not None:
-                global_block = framework.default_main_program().global_block()
-                global_block.append_op(
-                    type='fill_constant',
-                    outputs={'Out': [current_lr]},
-                    attrs={
-                        'dtype': current_lr.dtype,
-                        'shape': list(current_lr.shape),
-                        'value': float(value)
-                    },
-                    stop_gradient=True)
-        else:
-            assert len(value.shape) == 1 and value.shape[
-                0] == 1, "optimizer's learning rate must be 1-D Tensor with shape[1]"
-            self._learning_rate_map[framework.default_main_program()] = value
+        self._learning_rate = float(value)
+        current_lr = self._global_learning_rate()
+        if current_lr is not None:
+            global_block = framework.default_main_program().global_block()
+            global_block.append_op(
+                type='fill_constant',
+                outputs={'Out': [current_lr]},
+                attrs={
+                    'dtype': current_lr.dtype,
+                    'shape': list(current_lr.shape),
+                    'value': float(value)
+                },
+                stop_gradient=True)
 
     @framework.dygraph_only
     def get_lr(self):
         """
         :api_attr: imperative
         
-        Get current step learning rate. The return value is all the same When LearningRateDecay is not used,
-        otherwise return the step learning rate.
+        Get the learning rate of the current step. When _LRScheduler is not used, the return value is always the same;
+        otherwise the learning rate of the current step is returned.
 
         Returns:
             float: The learning rate of the current step.
@@ -434,14 +366,14 @@ class Optimizer(object):
 
                 import numpy as np
                 import paddle
-                # example1: LearningRateDecay is not used, return value is all the same
+                # example1: _LRScheduler is not used, return value is all the same
                 paddle.disable_static()
                 emb = paddle.nn.Embedding([10, 10])
                 adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters())
                 lr = adam.get_lr()
                 print(lr) # 0.001
 
-                # example2: PiecewiseDecay is used, return the step learning rate
+                # example2: PiecewiseLR is used, return the step learning rate
                 paddle.disable_static()
                 inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
                 linear = paddle.nn.Linear(10, 10)
@@ -451,7 +383,8 @@ class Optimizer(object):
                 
                 bd = [2, 4, 6, 8]
                 value = [0.2, 0.4, 0.6, 0.8, 1.0]
-                adam = paddle.optimizer.Adam(paddle.PiecewiseDecay(bd, value, 0),
+                scheduler = paddle.optimizer.PiecewiseLR(bd, value, 0)
+                adam = paddle.optimizer.Adam(scheduler,
                                        parameters=linear.parameters())
 
                 # first step: learning rate is 0.2
@@ -462,24 +395,14 @@ class Optimizer(object):
                 for i in range(12):
                     adam.step()
                     lr = adam.get_lr()
+                    scheduler.step()
                     np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True
 
         """
-        current_lr = self._global_learning_rate()
-        if isinstance(current_lr, framework.Variable):
-            return self._global_learning_rate().numpy()[0]
-
         if isinstance(self._learning_rate, float):
             return self._learning_rate
-        elif isinstance(self._learning_rate, _LearningRateEpochDecay):
-            step_lr = self._learning_rate()
-            return step_lr.numpy()[0]
         else:
-            step_lr = self._learning_rate.step()
-            if isinstance(step_lr, (float, int)):
-                return step_lr
-            else:
-                return step_lr.numpy()[0]
+            return self._learning_rate()
 
     def _global_learning_rate(self, program=None):
         """
diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb3a578e15724e9501d69dc209bdedc65afeb82b
--- /dev/null
+++ b/python/paddle/optimizer/sgd.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable, name_scope
+from ..fluid.dygraph import no_grad
+__all__ = ["SGD"]
+
+
+class SGD(Optimizer):
+    """
+    Optimizer of the stochastic gradient descent algorithm.
+
+    .. math::
+
+        param\_out = param - learning\_rate * grad
+
+    Parameters:
+        learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+            It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
+        parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+            It can be a float value as coeff of L2 regularization or \
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            the regularization setting here in optimizer will be ignored for this parameter. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
+            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three clipping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): The default value is None. Normally there is no need for user
+                to set this property. For more information, please refer to
+                :ref:`api_guide_Name` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+            sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
+            # Back-propagate from the scalar loss, then apply one SGD update
+            # and reset the gradients.
+            loss.backward()
+            sgd.step()
+            sgd.clear_grad()
+
+    """
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 parameters=None,
+                 weight_decay=None,
+                 grad_clip=None,
+                 name=None):
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set")
+        super(SGD, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name)
+        self.type = "sgd"
+
+    @no_grad()
+    def _append_optimize_op(self, block, param_and_grad):
+        lr = self._create_param_lr(param_and_grad)
+        if framework.in_dygraph_mode():
+            core.ops.sgd(param_and_grad[0], lr, param_and_grad[1],
+                         param_and_grad[0])
+            return None
+
+        assert isinstance(block, framework.Block)
+        # create the optimize op
+        sgd_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "LearningRate": lr
+            },
+            outputs={"ParamOut": param_and_grad[0]},
+            stop_gradient=True)
+
+        return sgd_op
diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py
index 06b9c7cdbef5dd11d237a2b85586e598611bf83e..eb70320ea7551de6e1117900e3769f000fdf23dd 100644
--- a/python/paddle/static/input.py
+++ b/python/paddle/static/input.py
@@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle
-import numpy as np
 import six
 
-from paddle.fluid import core
+import paddle
+from paddle.fluid import core, Variable
 from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.data_feeder import check_dtype, check_type
+from paddle.fluid.data_feeder import check_type
+from paddle.fluid.framework import convert_np_dtype_to_dtype_
 
 __all__ = ['data', 'InputSpec']
 
@@ -41,7 +41,7 @@ def data(name, shape, dtype=None, lod_level=0):
            size. For example, it is useful to set changeable batch size as "None" or -1.
        dtype (np.dtype|str, optional): The type of the data. Supported
            dtype: bool, float16, float32, float64, int8, int16, int32, int64,
-           uint8. Default: None. When `dtype` is not set, the dtype will get 
+           uint8. Default: None. When `dtype` is not set, the dtype will get
            from the global dtype by `paddle.get_default_dtype()`.
        lod_level (int, optional): The LoD level of the LoDTensor. Usually users
            don't have to set this value. For more details about when and how to
@@ -54,13 +54,12 @@ def data(name, shape, dtype=None, lod_level=0):
         .. code-block:: python
 
           import numpy as np
-          import paddle.fluid as fluid
           import paddle
 
           # Creates a variable with fixed size [3, 2, 1]
           # User can only feed data of the same shape to x
           # the dtype is not set, so it will set "float32" by
-          # paddle.get_default_dtype(). You can use paddle.get_default_dtype() to 
+          # paddle.get_default_dtype(). You can use paddle.get_default_dtype() to
           # change the global dtype
           x = paddle.static.data(name='x', shape=[3, 2, 1])
 
@@ -75,8 +74,8 @@ def data(name, shape, dtype=None, lod_level=0):
           # and fetch z, like implementing "1 + 1 = 2" in PaddlePaddle
           feed_data = np.ones(shape=[3, 2, 1], dtype=np.float32)
 
-          exe = fluid.Executor(fluid.CPUPlace())
-          out = exe.run(fluid.default_main_program(),
+          exe = paddle.static.Executor(paddle.framework.CPUPlace())
+          out = exe.run(paddle.static.default_main_program(),
                         feed={
                             'x': feed_data,
                             'y': feed_data
@@ -120,11 +119,13 @@ def data(name, shape, dtype=None, lod_level=0):
 
 class InputSpec(object):
     """
-    Define input specification of the model.
+    InputSpec describes the signature information of the model input, such as ``shape`` , ``dtype`` , ``name`` .
+
+    This interface is often used to specify input tensor information of models in high-level API.
+    It's also used to specify the tensor information for each input parameter of the forward function
+    decorated by `@paddle.jit.to_static`.
 
     Args:
-        name (str): The name/alias of the variable, see :ref:`api_guide_Name`
-            for more details.
         shape (tuple(integers)|list[integers]): List|Tuple of integers
             declaring the shape. You can set "None" or -1 at a dimension
             to indicate the dimension can be of any size. For example,
@@ -132,18 +133,28 @@ class InputSpec(object):
         dtype (np.dtype|str, optional): The type of the data. Supported
             dtype: bool, float16, float32, float64, int8, int16, int32, int64,
             uint8. Default: float32.
+        name (str): The name/alias of the variable, see :ref:`api_guide_Name`
+            for more details.
 
     Examples:
         .. code-block:: python
 
-        from paddle.static import InputSpec
+            from paddle.static import InputSpec
+
+            input = InputSpec([None, 784], 'float32', 'x')
+            label = InputSpec([None, 1], 'int64', 'label')
 
-        input = InputSpec([None, 784], 'float32', 'x')
-        label = InputSpec([None, 1], 'int64', 'label')
+            print(input)  # InputSpec(shape=(-1, 784), dtype=VarType.FP32, name=x)
+            print(label)  # InputSpec(shape=(-1, 1), dtype=VarType.INT64, name=label)
     """
 
-    def __init__(self, shape=None, dtype='float32', name=None):
-        self.shape = shape
+    def __init__(self, shape, dtype='float32', name=None):
+        # validate `shape` and replace `None` entries with -1
+        self.shape = self._verify(shape)
+        # convert dtype into a unified representation
+        if dtype is not None:
+            if not isinstance(dtype, core.VarDesc.VarType):
+                dtype = convert_np_dtype_to_dtype_(dtype)
         self.dtype = dtype
         self.name = name
 
 
@@ -153,3 +164,167 @@ class InputSpec(object):
     def __repr__(self):
         return '{}(shape={}, dtype={}, name={})'.format(
             type(self).__name__, self.shape, self.dtype, self.name)
+
+    @classmethod
+    def from_tensor(cls, tensor, name=None):
+        """
+        Generates an InputSpec based on the description of input tensor.
+
+        Args:
+            tensor(Tensor): the source Variable/VarBase; other types raise ValueError.
+            name(str, optional): name of the InputSpec; falls back to
+                ``tensor.name`` when it is None or empty.
+
+        Returns:
+            An InputSpec instance generated from Tensor.
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle
+                from paddle.static import InputSpec
+
+                paddle.disable_static()
+                x = paddle.to_tensor(np.ones([2, 2], np.float32))
+                x_spec = InputSpec.from_tensor(x, name='x')
+                print(x_spec)  # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x)
+        """
+        if isinstance(tensor, (Variable, core.VarBase)):
+            return cls(tensor.shape, tensor.dtype, name or tensor.name)
+        else:
+            raise ValueError(
+                "Input `tensor` should be a Tensor, but received {}.".format(
+                    type(tensor).__name__))
+
+    @classmethod
+    def from_numpy(cls, ndarray, name=None):
+        """
+        Generates an InputSpec based on the description of input np.ndarray.
+
+        Args:
+            ndarray(np.ndarray): the source array whose shape and dtype are used.
+            name(str, optional): name of the generated InputSpec. Default: None.
+
+        Returns:
+            An InputSpec instance generated from the ndarray.
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                from paddle.static import InputSpec
+
+                x = np.ones([2, 2], np.float32)
+                x_spec = InputSpec.from_numpy(x, name='x')
+                print(x_spec)  # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x)
+        """
+        return cls(ndarray.shape, ndarray.dtype, name)
+
+    def batch(self, batch_size):
+        """
+        Inserts `batch_size` in front of the `shape`.
+
+        Args:
+            batch_size(int|list[int]|tuple[int]): the batch size to insert; a
+                list or tuple is accepted only if it holds exactly one element.
+
+        Returns:
+            The original InputSpec instance by inserting `batch_size` in front of `shape`.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.static import InputSpec
+
+                x_spec = InputSpec(shape=[64], dtype='float32', name='x')
+                x_spec.batch(4)
+                print(x_spec) # InputSpec(shape=(4, 64), dtype=VarType.FP32, name=x)
+
+        """
+        if isinstance(batch_size, (list, tuple)):
+            if len(batch_size) != 1:
+                raise ValueError(
+                    "Length of batch_size: {} shall be 1, but received {}.".
+                    format(batch_size, len(batch_size)))
+            # A one-element sequence only has index 0.
+            batch_size = batch_size[0]
+        elif not isinstance(batch_size, six.integer_types):
+            raise TypeError("type(batch_size) shall be `int`, but received {}.".
+                            format(type(batch_size).__name__))
+
+        self.shape = (batch_size, ) + tuple(self.shape)
+        return self
+
+    def unbatch(self):
+        """
+        Removes the first element of `shape`.
+
+        Returns:
+            The original InputSpec instance by removing the first element of `shape` .
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.static import InputSpec
+
+                x_spec = InputSpec(shape=[4, 64], dtype='float32', name='x')
+                x_spec.unbatch()
+                print(x_spec) # InputSpec(shape=(64,), dtype=VarType.FP32, name=x)
+        """
+        if len(self.shape) == 0:
+            raise ValueError(
+                "Not support to unbatch a InputSpec when len(shape) == 0.")
+
+        # Re-validate the remainder; `_verify` rejects an empty shape.
+        self.shape = self._verify(self.shape[1:])
+        return self
+
+    def _verify(self, shape):
+        """
+        Checks `shape`; returns a tuple with `None` and values < -1 set to -1.
+        """
+        if not isinstance(shape, (list, tuple)):
+            raise TypeError(
+                "Type of `shape` in InputSpec should be one of (tuple, list), but received {}.".
+                format(type(shape).__name__))
+        if len(shape) == 0:
+            raise ValueError(
+                "`shape` in InputSpec should contain at least 1 element, but received {}.".
+                format(shape))
+
+        # Copy first: tuples are immutable and a caller's list must stay intact.
+        normalized = list(shape)
+        for i, ele in enumerate(normalized):
+            if ele is not None and not isinstance(ele, six.integer_types):
+                raise ValueError(
+                    "shape[{}] should be an `int`, but received `{}`:{}.".
+                    format(i, type(ele).__name__, ele))
+            if ele is None or ele < -1:
+                normalized[i] = -1
+        return tuple(normalized)
+
+    def __hash__(self):
+        # Note(Aurelius84): `name` is deliberately left out of the hash key; only
+        # `shape` and `dtype` are hashed. With @paddle.jit.to_static there is no need
+        # to generate a new program in the following cases.
+        #
+        # Case 1:
+        #      foo(x_var)
+        #      foo(y_var)
+        #  x_var and y_var hold the same shape and dtype, so they should share one program.
+        #
+        # Case 2:
+        #      foo(x_var)
+        #      foo(x_np)  # x_np is a numpy.ndarray.
+        #  x_var and x_np hold the same shape and dtype, so they should also share one program.
+        #  (`__eq__` still compares `name`; equal hashes do not imply equal specs.)
+        return hash((tuple(self.shape), self.dtype))
+
+    def __eq__(self, other):
+        fields = ('shape', 'dtype', 'name')
+        return type(self) is type(other) and all(
+            getattr(self, f) == getattr(other, f) for f in fields)
+
+    def __ne__(self, other):
+        return not (self == other)
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
old mode 100644
new mode 100755
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 1911d8ccc25e01ee6419fd26126881304ab61f01..9ef66712540aa54eac39b7e6160c5c91b6e3fcd5 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -71,22 +71,22 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
     Args:
         data(scalar|tuple|list|ndarray|Tensor|ComplexTensor): Initial data for the tensor.
             Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor, paddle\.ComplexTensor.
-        dtype(str, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 
+        dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 
             'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8'. And
-            'complex64' , 'complex128' only for ComplexTensor.
-            Default: None, infers data type from ``data`` .
+            'complex64' , 'complex128' only for ComplexTensor. Default: None, for floating point numbers,
+            the type is taken from ``get_default_dtype`` ; for other types, it is inferred from ``data`` .
         place(CPUPlace|CUDAPinnedPlace|CUDAPlace, optional): The place to allocate Tensor. Can be  
             CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place.
         stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True.
 
     Returns:
-        Tensor: A Tensor or ComplexTensor constructed from ``data``.
+        Tensor: A Tensor or ComplexTensor constructed from ``data`` .
 
     Raises:
         TypeError: If the data type of ``data`` is not scalar, list, tuple, numpy.ndarray, paddle.Tensor, paddle.ComplexTensor
         ValueError: If ``data`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]]
         TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128
-        ValueError: If ``place`` is not paddle.Place, paddle.CUDAPinnedPlace, paddle.CUDAPlace
+        ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace
 
     Examples:
 
@@ -94,7 +94,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
 
         import paddle
         import numpy as np
-        paddle.enable_imperative()
+        paddle.disable_static()
                 
         type(paddle.to_tensor(1))
         # <class 'paddle.Tensor'>
@@ -132,7 +132,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
         #   - dtype: double
         #   - data: [0.1 0.2 0.3 0.4]
 
-        type(paddle.to_tensor([[1+1j, 2], [3+2j, 4]]), , dtype='complex64')
+        type(paddle.to_tensor([[1+1j, 2], [3+2j, 4]], dtype='complex64'))
         # <class 'paddle.ComplexTensor'>
 
         paddle.to_tensor([[1+1j, 2], [3+2j, 4]], dtype='complex64')
@@ -189,12 +189,13 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
                 "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor|paddle.ComplexTensor".
                 format(type(data)))
 
-    if dtype:
-        dtype = convert_dtype(dtype)
-        if dtype != data.dtype:
-            data = data.astype(dtype)
-
     if not np.iscomplexobj(data):
+        if dtype:
+            dtype = convert_dtype(dtype)
+        elif data.dtype in ['float16', 'float32', 'float64']:
+            dtype = paddle.framework.get_default_dtype()
+        if dtype and dtype != data.dtype:
+            data = data.astype(dtype)
         return paddle.Tensor(
             value=data,
             place=place,
@@ -202,6 +203,14 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
             zero_copy=True,
             stop_gradient=stop_gradient)
     else:
+        if dtype:
+            dtype = convert_dtype(dtype)
+        else:
+            dtype = paddle.framework.get_default_dtype()
+            dtype = 'complex64' if dtype in ['float16', 'float32'
+                                             ] else 'complex128'
+        if dtype != data.dtype:
+            data = data.astype(dtype)
         name = unique_name.generate('generated_tensor')
         real_tensor = paddle.Tensor(
             value=data.real,
@@ -978,6 +987,13 @@ def diag(x, offset=0, padding_value=0, name=None):
     check_type(x, 'x', (Variable), 'diag_v2')
     check_dtype(x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'],
                 'diag_v2')
+    check_type(offset, 'offset', (int), 'diag_v2')
+    check_type(padding_value, 'padding_value', (int, float), 'diag_v2')
+    if len(x.shape) != 1 and len(x.shape) != 2:
+        raise ValueError(
+            "The dimension of input x must be either 1 or 2, but received {}".
+            format(len(x.shape)))
+
     helper = LayerHelper("diag_v2", **locals())
 
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index a7bf2272a599ef2d6de076e7129b43152ca47b06..b5b528325cd9f52a8b61ef21df0095c41da5a8ed 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import numpy as np
 from paddle.common_ops_import import *
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.data_feeder import check_variable_and_dtype, check_type
@@ -170,7 +171,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
     return out
 
 
-def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
+def norm(x, p='fro', axis=None, keepdim=False, name=None):
     """
 	:alias_main: paddle.norm
 	:alias: paddle.norm,paddle.tensor.norm,paddle.tensor.linalg.norm
@@ -179,20 +180,19 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
     or 2-norm, and in general the p-norm for p > 0) of a given tensor.
 
     Args:
-        input (Variable): The input tensor could be N-D tensor, and the input data
+        x (Tensor): The input tensor could be N-D tensor, and the input data
             type could be float32 or float64.
-        p (float|string, optional): Order of the norm. Supported values are `fro`, `1`, `2`,
-            and any positive real number yielding the corresponding p-norm.
-        axis (int|list, optional): The axis on which to apply norm operation. If axis is int
-            or list with only one element, the vector norm is computed over the axis.
-            If axis is a list with two elements, the matrix norm is computed over the axis.
+        p (float|string, optional): Order of the norm. Supported values are `fro`, `0`, `1`, `2`,
+           `inf`,`-inf` and any positive real number yielding the corresponding p-norm.
+            Not supported: ord < 0, nuclear norm.
+        axis (int|list|tuple, optional): The axis on which to apply norm operation. If axis is int
+            or list(int)/tuple(int)  with only one element, the vector norm is computed over the axis.
             If `axis < 0`, the dimension to norm operation is rank(input) + axis.
+            If axis is a list(int)/tuple(int) with two elements, the matrix norm is computed over the axis.
         keepdim (bool, optional): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have fewer dimension
             than the :attr:`input` unless :attr:`keepdim` is true, default
             value is False.
-        out (Variable, optional): The output tensor, default value is None. It's data type
-            must be the same as the input Tensor.
         name (str, optional): The default value is None. Normally there is no need for
             user to set this property. For more information, please refer to :ref:`api_guide_Name`.
 
@@ -208,29 +208,57 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
         .. code-block:: python
             
             import paddle
-            import paddle.fluid as fluid
-            x = fluid.data(name='x', shape=[2, 3, 5], dtype='float64')
-            
+            import numpy as np
+            paddle.disable_static()
+            shape=[2, 3, 4]
+            np_input = np.arange(24).astype('float32') - 12
+            np_input = np_input.reshape(shape)
+            x = paddle.to_tensor(np_input)
+            #[[[-12. -11. -10.  -9.] [ -8.  -7.  -6.  -5.] [ -4.  -3.  -2.  -1.]]
+            # [[  0.   1.   2.   3.] [  4.   5.   6.   7.] [  8.   9.  10.  11.]]]
+
             # compute frobenius norm along last two dimensions.
-            out_fro = paddle.norm(x, p='fro', axis=[1,2])
-            
+            out_fro = paddle.norm(x, p='fro', axis=[0,1])
+            # out_fro.numpy() [17.435596 16.911535 16.7332   16.911535]
+
             # compute 2-order vector norm along last dimension.
             out_pnorm = paddle.norm(x, p=2, axis=-1)
+            #out_pnorm.numpy(): [[21.118711  13.190906   5.477226]
+            #                    [ 3.7416575 11.224972  19.131126]]
+
+            # compute 2-order  norm along [0,1] dimension.
+            out_pnorm = paddle.norm(x, p=2, axis=[0,1])
+            #out_pnorm.numpy(): [17.435596 16.911535 16.7332   16.911535]
+
+            # compute inf-order  norm
+            out_pnorm = paddle.norm(x, p=np.inf)
+            #out_pnorm.numpy()  = [12.]
+            out_pnorm = paddle.norm(x, p=np.inf, axis=0)
+            #out_pnorm.numpy(): [[12. 11. 10. 9.] [8. 7. 6. 7.] [8. 9. 10. 11.]]
+
+            # compute -inf-order  norm
+            out_pnorm = paddle.norm(x, p=-np.inf)
+            #out_pnorm.numpy(): [0.]
+            out_pnorm = paddle.norm(x, p=-np.inf, axis=0)
+            #out_pnorm.numpy(): [[0. 1. 2. 3.] [4. 5. 6. 5.] [4. 3. 2. 1.]]
     """
 
-    def frobenius_norm(input, dim=None, keepdim=False, out=None, name=None):
+    def frobenius_norm(input, dim=None, keepdim=False, name=None):
         """
         The frobenius norm OP is to calculate the frobenius norm of certain two dimensions of Tensor `input`.
         Args:
           input (Variable): Tensor, data type float32, float64.
           dim (list, optional): None for last two dimensions.
           keepdim (bool, optional): Whether keep the dimensions as the `input`, Default False.
-          out (Variable, optional): The tensor variable storing the output.
         """
         if dim is not None and not (isinstance(dim, list) and len(dim) == 2):
             raise ValueError(
                 "The dim of frobenius norm op should be None or two elements list!"
             )
+        if in_dygraph_mode():
+            if dim is None: dim = [-1]
+            return core.ops.frobenius_norm(input, 'dim', dim, 'keepdim',
+                                           keepdim)
         attrs = {
             'dim': dim if dim != None else [-2, -1],
             'keep_dim': keepdim,
@@ -242,16 +270,8 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
                                  'frobenius_norm')
 
         helper = LayerHelper('frobenius_norm', **locals())
-        if out is None:
-            out = helper.create_variable_for_type_inference(
-                dtype=helper.input_dtype())
-        else:
-            check_type(out, 'out', (Variable), 'frobenius_norm')
-            check_dtype(
-                out.dtype, out.name,
-                convert_dtype(input.dtype), 'frobenius_norm',
-                '(The out data type in frobenius_norm must be the same with input data type.)'
-            )
+        out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
 
         helper.append_op(
             type='frobenius_norm',
@@ -264,7 +284,7 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
                     porder=None,
                     axis=None,
                     keepdim=False,
-                    out=None,
+                    asvector=False,
                     name=None):
         """
         Calculate the p-order vector norm for certain  dimension of Tensor `input`.
@@ -273,32 +293,28 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
           porder (float, optional): None for porder=2.0.
           axis (int, optional): None for last dimension.
           keepdim (bool, optional): Whether keep the dimensions as the `input`, Default False.
-          out (Variable, optional): The tensor variable storing the output.
         """
+        if in_dygraph_mode():
+            if axis is None: axis = -1
+            return core.ops.p_norm(input, 'porder', porder, 'axis', axis,
+                                   'keepdim', keepdim, 'asvector', asvector)
         if porder is not None:
             check_type(porder, 'porder', (float, int), 'p_norm')
         if axis is not None:
             check_type(axis, 'axis', (int), 'p_norm')
+        check_variable_and_dtype(input, 'input', ['float32', 'float64'],
+                                 'p_norm')
+
         attrs = {
             'axis': axis if axis is not None else -1,
             'porder': float(porder) if porder is not None else 2.0,
             'keepdim': keepdim,
+            'asvector': asvector,
             'epsilon': 1e-12,
         }
-        check_variable_and_dtype(input, 'input', ['float32', 'float64'],
-                                 'p_norm')
-
         helper = LayerHelper('p_norm', **locals())
-        if out is None:
-            out = helper.create_variable_for_type_inference(
-                dtype=helper.input_dtype())
-        else:
-            check_type(out, 'out', (Variable), 'p_norm')
-            check_dtype(
-                out.dtype, out.name,
-                convert_dtype(input.dtype), 'p_norm',
-                '(The out data type in p_norm must be the same with input data type.)'
-            )
+        out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
 
         helper.append_op(
             type='p_norm',
@@ -307,21 +323,126 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
             attrs=attrs)
         return out
 
+    def inf_norm(input,
+                 porder=None,
+                 axis=axis,
+                 keepdim=False,
+                 asvector=False,
+                 name=None):
+        # abs + reduce: porder == +inf picks reduce_max, otherwise reduce_min.
+        helper = LayerHelper('frobenius_norm', **locals())
+        out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
+        helper.append_op(type='abs', inputs={'X': input}, outputs={'Out': out})
+        reduce_out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
+        # No axis (None or []) or asvector=True reduces over all elements.
+        reduce_all = axis is None or axis == [] or asvector == True
+        axis = axis if axis is not None and axis != [] else [0]
+
+        reduce_type = 'reduce_max' if porder == float('inf') else 'reduce_min'
+        helper.append_op(
+            type=reduce_type,
+            inputs={'X': out},
+            outputs={'Out': reduce_out},
+            attrs={'dim': axis,
+                   'keep_dim': keepdim,
+                   'reduce_all': reduce_all})
+
+        return reduce_out
+
+    def p0_matrix_norm(input, porder=0., axis=axis, keepdim=False, name=None):
+        # Counts the non-zero entries of `input` along `axis`.
+        block = LayerHelper('norm', **locals())
+
+        # Cast to bool, then back to FP32, so reduce_sum adds one per non-zero.
+        cast_out = block.create_variable_for_type_inference(dtype=bool)
+        block.append_op(
+            type='cast',
+            inputs={'X': input},
+            outputs={'Out': cast_out},
+            attrs={
+                'in_dtype': input.dtype,
+                'out_dtype': int(core.VarDesc.VarType.BOOL)
+            })
+        cast_out2 = block.create_variable_for_type_inference(dtype='float32')
+        block.append_op(
+            type='cast',
+            inputs={'X': cast_out},
+            outputs={'Out': cast_out2},
+            attrs={
+                'in_dtype': cast_out.dtype,
+                'out_dtype': int(core.VarDesc.VarType.FP32)
+            })
+        sum_out = block.create_variable_for_type_inference(
+            dtype=block.input_dtype())
+        block.append_op(
+            type='reduce_sum',
+            inputs={'X': cast_out2},
+            outputs={'Out': sum_out},
+            attrs={
+                'dim': axis,
+                'keep_dim': keepdim,
+                'reduce_all': axis is None
+            })
+        return sum_out
+
+    def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None):
+        block = LayerHelper('norm', **locals())
+        out = block.create_variable_for_type_inference(
+            dtype=block.input_dtype())
+        abs_out = block.create_variable_for_type_inference(
+            dtype=block.input_dtype())
+        block.append_op(
+            type='abs', inputs={'X': input}, outputs={'Out': abs_out})
+        pow_out = block.create_variable_for_type_inference(
+            dtype=block.input_dtype())
+        # Entry-wise p-norm: (sum(|x| ** porder)) ** (1 / porder) over `axis`.
+        block.append_op(
+            type='pow',
+            inputs={'X': abs_out},
+            outputs={'Out': pow_out},
+            attrs={'factor': porder})
+        sum_out = block.create_variable_for_type_inference(
+            dtype=block.input_dtype())
+        block.append_op(
+            type='reduce_sum',
+            inputs={'X': pow_out},
+            outputs={'Out': sum_out},
+            attrs={
+                'dim': axis,
+                'keep_dim': keepdim,
+                'reduce_all': axis is None
+            })
+        # Take the 1/porder-th root of the summed powers to finish the norm.
+        block.append_op(
+            type='pow',
+            inputs={'X': sum_out},
+            outputs={'Out': out},
+            attrs={'factor': float(1. / porder)})
+        return out
+
     if axis is None and p is not None:
         if isinstance(p, str):
             if p == "fro":
-                return frobenius_norm(
-                    input, dim=axis, keepdim=keepdim, out=out, name=name)
+                return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name)
             else:
                 raise ValueError(
                     "only valid string values are 'fro', found {}".format(p))
         elif isinstance(p, (int, float)):
             return vector_norm(
-                input, porder=p, axis=axis, keepdim=keepdim, out=out, name=name)
+                x,
+                porder=p,
+                axis=axis,
+                keepdim=keepdim,
+                asvector=True,
+                name=name)
         else:
             raise ValueError("only valid p type is string or float, found {}".
                              format(type(p)))
 
+    if isinstance(axis, tuple):
+        axis = list(axis)
     if isinstance(axis, list) and len(axis) == 1:
         axis = axis[0]
 
@@ -329,7 +450,12 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
     if isinstance(axis, int):
         if isinstance(p, (int, float)):
             return vector_norm(
-                input, axis=axis, porder=p, keepdim=keepdim, out=out, name=name)
+                x,
+                axis=axis,
+                porder=p,
+                keepdim=keepdim,
+                asvector=False,
+                name=name)
         else:
             raise ValueError(
                 "unspport p for p-order vector norm. except float, found {}".
@@ -337,11 +463,14 @@ def norm(input, p='fro', axis=None, keepdim=False, out=None, name=None):
     #calculate matrix norm, where axis is list with two integers
     elif isinstance(axis, list) and len(axis) == 2:
         if p == "fro":
-            return frobenius_norm(
-                input, dim=axis, keepdim=keepdim, out=out, name=name)
+            return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name)
+        elif p == 0:
+            return p0_matrix_norm(x, axis=axis, keepdim=keepdim, name=name)
+        elif p == np.inf or p == -np.inf:
+            return inf_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name)
         else:
-            raise ValueError(
-                "unspport p for matrix norm, expcept 'fro', found {}".format(p))
+            return p_matrix_norm(
+                x, porder=p, axis=axis, keepdim=keepdim, name=name)
     else:
         raise ValueError(
             "except axis type int or list (length of list <=2), found {}".
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 44ec0a5a4df985d5217011a841065ce504483ab7..845d2cf4d199328bbf8d0e03cd3a7a24a61aafd2 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -23,7 +23,6 @@ from ..fluid.layers import utils
 import numpy as np
 # TODO: define functions to manipulate a tensor  
 from ..fluid.layers import cast  #DEFINE_ALIAS
-from ..fluid.layers import expand_as  #DEFINE_ALIAS
 from ..fluid.layers import slice  #DEFINE_ALIAS
 from ..fluid.layers import strided_slice  #DEFINE_ALIAS
 from ..fluid.layers import transpose  #DEFINE_ALIAS
@@ -612,6 +611,7 @@ def unique(x,
            return_inverse=False,
            return_counts=False,
            axis=None,
+           dtype="int64",
            name=None):
     """
     Returns the unique elements of `x` in ascending order.
@@ -625,6 +625,8 @@ def unique(x,
         return_counts(bool, optional): If True, also return the counts for each unique element.
         axis(int, optional): The axis to apply unique. If None, the input will be flattened.
             Default: None.
+        dtype(np.dtype|str, optional): The data type of `indices` or `inverse` tensor: int32 or int64.
+            Default: int64.
         name(str, optional): Name for the operation. For more information, please refer to
             :ref:`api_guide_Name`. Default: None.
 
@@ -650,6 +652,7 @@ def unique(x,
             np_counts = counts.numpy() # [1 1 3 1]
 
             x_data = np.array([[2, 1, 3], [3, 0, 1], [2, 1, 3]])
+            x = paddle.to_tensor(x_data)
             unique = paddle.unique(x)
             np_unique = unique.numpy() # [0 1 2 3]
 
@@ -662,11 +665,10 @@ def unique(x,
         axis = []
     else:
         axis = [axis]
-
+    attr_dtype = convert_np_dtype_to_dtype_(dtype)
     if in_dygraph_mode():
         out, inverse, indices, counts = core.ops.unique(
-            x, 'dtype',
-            convert_np_dtype_to_dtype_('int32'), 'return_index', return_index,
+            x, 'dtype', attr_dtype, 'return_index', return_index,
             'return_inverse', return_inverse, 'return_counts', return_counts,
             'axis', axis, "is_sorted", True)
         outs = [out]
@@ -687,12 +689,13 @@ def unique(x,
     check_type(return_index, 'return_index', bool, 'unique')
     check_type(return_inverse, 'return_inverse', bool, 'unique')
     check_type(return_counts, 'return_counts', bool, 'unique')
+    check_dtype(dtype, 'dtype', ['int32', 'int64'], 'unique')
     if len(axis) != 0:
         check_type(axis[0], 'axis', int, 'unique')
 
     helper = LayerHelper('unique', **locals())
     attrs = {
-        'dtype': int(core.VarDesc.VarType.INT32),
+        'dtype': attr_dtype,
         "return_index": return_index,
         "return_inverse": return_inverse,
         "return_counts": return_counts,
@@ -702,19 +705,19 @@ def unique(x,
     out = helper.create_variable_for_type_inference(
         dtype=x.dtype, stop_gradient=True)
     inverse = helper.create_variable_for_type_inference(
-        dtype=core.VarDesc.VarType.INT64, stop_gradient=True)
+        dtype=attr_dtype, stop_gradient=True)
     outputs = {"Out": out, "Index": inverse}
     outs = [out]
     if return_index:
         indices = helper.create_variable_for_type_inference(
-            dtype=core.VarDesc.VarType.INT64, stop_gradient=True)
+            dtype=attr_dtype, stop_gradient=True)
         outputs["Indices"] = indices
         outs.append(indices)
     if return_inverse:
         outs.append(inverse)
     if return_counts:
         counts = helper.create_variable_for_type_inference(
-            dtype=core.VarDesc.VarType.INT64, stop_gradient=True)
+            dtype=attr_dtype, stop_gradient=True)
         outputs["Counts"] = counts
         outs.append(counts)
 
@@ -1096,6 +1099,9 @@ def tile(x, repeat_times, name=None):
             np_out = out.numpy()
             # [[1, 2, 3], [1, 2, 3]]
     """
+    if in_dygraph_mode():
+        return core.ops.tile(x, 'repeat_times', repeat_times)
+
     check_variable_and_dtype(
         x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'tile')
     check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile')
@@ -1105,9 +1111,6 @@ def tile(x, repeat_times, name=None):
             "must set its stop_gradient to be True by "
             "some_var.stop_gradient == True supporting some_var is the input.")
 
-    if in_dygraph_mode():
-        return core.ops.tile(x, 'repeat_times', repeat_times)
-
     helper = LayerHelper('tile', **locals())
 
     inputs = {"X": [x]}
@@ -1172,6 +1175,9 @@ def expand_as(x, y, name=None):
             np_out = out.numpy()
             # [[1, 2, 3], [1, 2, 3]]
     """
+    if in_dygraph_mode():
+        return core.ops.expand_as_v2(x, y)
+
     check_variable_and_dtype(
         x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand_as')
     check_type(y, 'y', Variable, 'expand_as')
@@ -1184,9 +1190,6 @@ def expand_as(x, y, name=None):
             "some_var as the input 'x'.")
     inputs = {"X": [x], "target_tensor": [y]}
 
-    if in_dygraph_mode():
-        return core.ops.expand_as_v2(x, y)
-
     helper = LayerHelper('expand_as', **locals())
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
@@ -1225,6 +1228,9 @@ def expand(x, shape, name=None):
             out = out.numpy()
             # [[1, 2, 3], [1, 2, 3]]
     """
+    if in_dygraph_mode():
+        return core.ops.expand_v2(x, 'shape', shape)
+
     check_variable_and_dtype(
         x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand')
     check_type(shape, 'shape', (list, tuple, Variable), 'expand')
@@ -1237,9 +1243,6 @@ def expand(x, shape, name=None):
                          "some_var.stop_gradient = True, supporting "
                          "some_var as the input.")
 
-    if in_dygraph_mode():
-        return core.ops.expand_v2(x, 'shape', shape)
-
     helper = LayerHelper('expand', **locals())
 
     def get_attr_expand_shape(list_expand_shape):
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 9dfb31a5ac25b2afc9fe52bfc8bab5ad277d80b8..0d87c1c2cf705372de7b8534cf8faea1bb5320a6 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -15,14 +15,16 @@
 math functions
 """
 from __future__ import print_function
+import numpy as np
 
 from paddle.common_ops_import import *
+from paddle.tensor import cast
+import paddle
 from ..fluid import layers
 from ..fluid.framework import core, _varbase_creator, in_dygraph_mode, Variable
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
 from ..fluid.layers.layer_function_generator import _generate_doc_string_, generate_activation_fn, generate_layer_fn
-import sys
 
 # TODO: define math functions
 # yapf: disable
@@ -64,6 +66,7 @@ from ..fluid.layers import sums    #DEFINE_ALIAS
 from ..fluid import layers
 import paddle
 
+
 __all__ = [
         'abs',
         'acos',
@@ -86,8 +89,8 @@ __all__ = [
         'logsumexp',
         'mul',
         'multiplex',
-        'prod',
         'pow',
+        'prod',
         'reciprocal',
         'reduce_max',
         'reduce_min',
@@ -147,64 +150,87 @@ _supported_float_dtype_ = [
     VarDesc.VarType.FP64,
 ]
 
-@templatedoc()
-def pow(input, exponent, name=None):
+def pow(x, y, name=None):
     """
-	:alias_main: paddle.pow
-	:alias: paddle.pow,paddle.tensor.pow,paddle.tensor.math.pow
+    Compute the power of tensor elements. The equation is:
 
-    This is Pow Activation Operator.
+    .. math::
+        out = x^{y}
 
-    :math:`out = input^{exponent}`
+    **Note**:
+    ``paddle.pow`` supports broadcasting. If you want to know more about broadcasting, please refer to :ref:`user_guide_broadcasting` .
 
-    Args:
-        input(Variable): A ``Tensor`` or ``LoDTensor`` . The data type is ``float32`` or ``float64``.
-        exponent(float32|Variable): A scalar with type ``float32`` or a ``Tensor`` with shape [1] and type ``float32``.
-        name(str, optional): The default value is None. Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name` .
 
+    Args:
+        x (Tensor): An N-D Tensor, the data type is float32, float64, int32 or int64.
+        y (float|int|Tensor): A scalar, or an N-D Tensor with type float32, float64, int32 or int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
     Returns:
-        Variable: A ``Tensor`` or ``LoDTensor``. The data type is same as ``input``.
+        N-D Tensor. A location into which the result is stored. Its dimension equals with $x$.
 
     Examples:
 
-        .. code-block:: python
+        ..  code-block:: python
 
             import paddle
-            import paddle.fluid as fluid
-
-            x = fluid.data(name="x", shape=[32,32], dtype="float32")
+            import numpy as np
 
-            # example 1: argument exponent is float
-            y_1 = paddle.pow(x, 2.0)
-            # y_1 is x^{2.0}
+            paddle.disable_static()
+
+            # example 1: y is a scalar
+            x_data = np.array([1, 2, 3])
+            y = 2
+            x = paddle.to_tensor(x_data)
+            res = paddle.pow(x, y)
+            print(res.numpy()) # [1 4 9]
+
+            # example 2: y is a Tensor
+            y = paddle.fill_constant(shape=[1], value=2, dtype='float32')
+            res = paddle.pow(x, y)
+            print(res.numpy()) # [1 4 9]
 
-            # example 2: argument exponent is Variable
-            exponent_tensor = fluid.layers.fill_constant([1], "float32", 3.0)
-            y_2 = paddle.pow(x, exponent_tensor)
-            # y_2 is x^{3.0}
     """
+    # in dynamic graph mode
     if in_dygraph_mode():
-        return core.ops.pow(input, "exponent", exponent)
-
-    helper = LayerHelper('pow', **locals())
-    inputs = {'X': input}
-    attrs = {}
-    if isinstance(exponent, Variable):
-        exponent.stop_gradient = True
-        inputs['FactorTensor'] = exponent
+        if isinstance(y, (int, float)):
+            return core.ops.pow(x, 'factor', y)
+        elif isinstance(y, (paddle.Tensor, Variable)):
+            # Mixed-dtype operands are both promoted to float64 before
+            # elementwise_pow is applied.
+            if x.dtype != y.dtype:
+                y = cast(y, dtype='float64')
+                x = cast(x, dtype='float64')
+
+            return _elementwise_op_in_dygraph(
+                x, y, axis=-1, act=None, op_name='elementwise_pow')
+        else:
+            # Report type(y): objects that reach this branch (list, str,
+            # None, ...) generally have no ``dtype`` attribute.
+            raise TypeError('y must be scalar or tensor type, but received: %s '% (type(y)))
+    # in static graph mode
     else:
-        attrs['factor'] = exponent
-
-    out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    check_dtype(
-        out.dtype, out.name,
-        convert_dtype(input.dtype), 'pow',
-        '(The out data type in pow must be the same with input data type.)')
+        if isinstance(y, (int, float)):
+            helper = LayerHelper('pow', **locals())
+            inputs = {'X': x}
+            attrs = {'factor': y}
+            out = helper.create_variable_for_type_inference(dtype=x.dtype)
+            helper.append_op(
+                type='pow', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+            return out
+        elif isinstance(y, (paddle.Tensor, Variable)):
+            # TODO A potential speed improvement is supporting different types in C++ and removing the cast ops here
+            helper = LayerHelper('elementwise_pow', **locals())
+            if x.dtype != y.dtype:
+                y = cast(y, dtype='float64')
+                x = cast(x, dtype='float64')
+            # `out` is picked up from locals() by the helper built below,
+            # so it must be created after any cast has settled x.dtype.
+            out = helper.create_variable_for_type_inference(dtype=x.dtype)
+            return _elementwise_op(LayerHelper('elementwise_pow', **locals()))
+        else:
+            raise TypeError('y must be scalar or tensor type, but received: %s '% (type(y)))
 
-    helper.append_op(
-        type='pow', inputs=inputs, outputs={'Out': out}, attrs=attrs)
-    return out
 
 
 @dygraph_only
@@ -227,6 +253,8 @@ def _elementwise_op(helper):
     x = helper.kwargs.get('x', None)
     y = helper.kwargs.get('y', None)
 
+    out = helper.kwargs.get('out', None)
+
     assert x is not None, 'x cannot be None in {}'.format(original_op_type)
     assert y is not None, 'y cannot be None in {}'.format(original_op_type)
     check_variable_and_dtype(
@@ -239,11 +267,12 @@ def _elementwise_op(helper):
     axis = helper.kwargs.get('axis', -1)
     use_mkldnn = helper.kwargs.get('use_mkldnn', False)
     name = helper.kwargs.get('name', None)
-    if name is None:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    else:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
+
+    if out is None:
+        if name is None:
+            out = helper.create_variable_for_type_inference(dtype=x.dtype)
+        else:
+            out = helper.create_variable(name=name, dtype=x.dtype, persistable=False)
 
     helper.append_op(
         type=op_type,
@@ -1582,11 +1611,15 @@ def clip(x, min=None, max=None, name=None):
             # [[4.5, 6.4]
     """
 
-    assert min is not None or max is not None, "either min or max should be defined."
+    np_dtype = np.float32
+    if x.dtype == VarDesc.VarType.FP64:
+        np_dtype = np.float64
+    fmin = float(np.finfo(np_dtype).min)
+    fmax = float(np.finfo(np_dtype).max)
 
     if in_dygraph_mode():
-        min = sys.float_info.min if min is None else min
-        max = sys.float_info.max if max is None else max
+        min = fmin if min is None else min
+        max = fmax if max is None else max
         return core.ops.clip(x, "min", min, "max", max)
 
     if min is not None:
@@ -1600,10 +1633,10 @@ def clip(x, min=None, max=None, name=None):
             check_dtype(max.dtype, 'max', ['float32', 'float64', 'int32'],
                         'clip', '(When the type of max in clip is Variable.)')
 
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'clip')
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'clip')
 
     inputs = {'X': x}
-    attrs = {'min': sys.float_info.min, 'max': sys.float_info.max}
+    attrs = {'min': fmin, 'max': fmax}
 
     if isinstance(min, Variable):
         min.stop_gradient = True
diff --git a/python/requirements.txt b/python/requirements.txt
index 28c84c1d630a66077a737a48bb8a26200ec48f0b..5e81ec680897024e7c32d193bef1716e9b25b4a4 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -23,3 +23,4 @@ objgraph
 astor
 pathlib
 netifaces
+psutil
diff --git a/tools/summary_env.py b/tools/summary_env.py
index 0252d9adcd07255e69a1abd81c7704eda02745b8..39d6acaf536c533a218d3d53b596c469ab19922d 100644
--- a/tools/summary_env.py
+++ b/tools/summary_env.py
@@ -55,7 +55,7 @@ def get_os_info():
     else:
         plat = None
         ver = None
-    envs['os_info'] = "{} {}".format(plat, ver)
+    envs['os_info'] = "{0} {1}".format(plat, ver)
 
 
 def get_python_info():
@@ -93,7 +93,7 @@ def get_cudnn_info():
         if cudnn_dll_path:
             cudnn_header_path = cudnn_dll_path.split('bin')[
                 0] + 'include\cudnn.h'
-            cmd = 'type "{}" | findstr "{}" | findstr /v "CUDNN_VERSION"'
+            cmd = 'type "{0}" | findstr "{1}" | findstr /v "CUDNN_VERSION"'
         else:
             envs['cudnn_version'] = None
             return
@@ -102,7 +102,7 @@ def get_cudnn_info():
             'whereis "cudnn.h" | awk \'{print $2}\'')
         if cudnn_header_path:
             cudnn_header_path = cudnn_header_path.strip()
-            cmd = 'cat "{}" | grep "{}" | grep -v "CUDNN_VERSION"'
+            cmd = 'cat "{0}" | grep "{1}" | grep -v "CUDNN_VERSION"'
         else:
             envs['cudnn_version'] = None
             return
@@ -112,7 +112,7 @@ def get_cudnn_info():
     patch_level = _get_cudnn_ver(
         cmd.format(cudnn_header_path, 'CUDNN_PATCHLEVEL'))
 
-    envs['cudnn_version'] = "{}.{}.{}".format(major, minor, patch_level)
+    envs['cudnn_version'] = "{0}.{1}.{2}".format(major, minor, patch_level)
 
 
 def get_driver_info():
@@ -132,7 +132,7 @@ def main():
     get_cuda_info()
     get_cudnn_info()
     get_driver_info()
-    print(envs_template.format(**envs))
+    print('*' * 40 + envs_template.format(**envs) + '*' * 40)
 
 
 if __name__ == '__main__':
diff --git a/tools/wlist.json b/tools/wlist.json
index c6114918e5932a9cfd139fd0212698c5ea97d3cc..20f6a9cbaedb391995b3757612ec24f2061a8a81 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -148,7 +148,20 @@
         "Callback.on_eval_batch_end",
         "Callback.on_test_batch_begin",
         "Callback.on_test_batch_end",
-        "Model.prepare"
+        "Model.prepare",
+        "SimpleRNNCell",
+        "SimpleRNNCell.forward",
+        "LSTMCell",
+        "LSTMCell.forward",
+        "GRUCell",
+        "GRUCell.forward",
+        "SimpleRNN",
+        "GRU",
+        "LSTM",
+        "RNN",
+        "BiRNN",
+        "RNNCellBase",
+        "RNNCellBase.get_initial_states"
     ],
     "wlist_no_op_pass":[
         "gelu",
@@ -234,6 +247,7 @@
         "prroi_pool"
     ],
     "wlist_temp":[
+        "to_tensor",
         "ChunkEvaluator",
         "EditDistance",
         "ErrorClipByValue",