diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index b7a93cd9ee2160090c0142d62d96da72e4c58717..6f4671c13a9e3dccb9be0a06f4bc2453af94bd55 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -16,6 +16,7 @@ else()
   set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
   set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
   set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
+  set(paddle_known_gpu_archs11 "35 50 52 60 61 70 75 80")
 endif()
 
 ######################################################################################
@@ -188,6 +189,10 @@ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
+elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs11})
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
 endif()
 
 add_definitions("-DPADDLE_CUDA_BINVER=\"${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}\"")
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 9d5c0cc7048f7db539c090d28c6184ac6d72d75a..bb5e2e1369a8478b500572106f9d11dff12e0189 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -272,7 +272,7 @@ cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatib
 cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
 
-cc_library(generator SRCS generator.cc)
+cc_library(generator SRCS generator.cc DEPS enforce place)
 
 # Get the current working branch
 execute_process(
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index 3cea7a66d01051824a1de01d62c237636771804b..f757e244e38ec965d62d673e63ed082ca70c63c7 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -116,6 +116,8 @@ void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) {
       return platform::to_void_cast(tensor.data<float>());
     case mkldnn::memory::data_type::s32:
       return platform::to_void_cast(tensor.data<int32_t>());
+    case mkldnn::memory::data_type::bf16:
+      return platform::to_void_cast(tensor.data<paddle::platform::bfloat16>());
     default:
       PADDLE_THROW(
           platform::errors::InvalidArgument("Wrong mkldnn type provided."));
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index 6eb84ef9d7c01b589cc95a78ea9727a81f6dc36e..b92c47c2eb018603e1b3156921fb2c1702864c57 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -61,7 +61,8 @@ inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) {
       {DataTypeTrait<float>::DataType(), MKLDNNDataType::f32},
       {DataTypeTrait<int8_t>::DataType(), MKLDNNDataType::s8},
       {DataTypeTrait<uint8_t>::DataType(), MKLDNNDataType::u8},
-      {DataTypeTrait<int32_t>::DataType(), MKLDNNDataType::s32}};
+      {DataTypeTrait<int32_t>::DataType(), MKLDNNDataType::s32},
+      {DataTypeTrait<platform::bfloat16>::DataType(), MKLDNNDataType::bf16}};
   auto iter = dict.find(static_cast<int>(type));
   if (iter != dict.end()) return iter->second;
   return MKLDNNDataType::undef;
@@ -74,6 +75,9 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
 void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                                const OpKernelType& expected_kernel_type,
                                const Tensor& in, Tensor* out);
+
+void* GetDataFromTensor(const Tensor& tensor, MKLDNNDataType type);
+
 #endif
 
 std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc
index a0d08826b854fea9256382f0e065fd59dda8c8b3..8dfad23db65178c46140b887811846e413bebd00 100644
--- a/paddle/fluid/framework/data_layout_transform_test.cc
+++ b/paddle/fluid/framework/data_layout_transform_test.cc
@@ -43,3 +43,17 @@ TEST(DataTransform, DataLayoutFunction) {
   EXPECT_TRUE(in.layout() == paddle::framework::DataLayout::kNHWC);
   EXPECT_TRUE(in.dims() == paddle::framework::make_ddim({2, 3, 1, 2}));
 }
+
+#ifdef PADDLE_WITH_MKLDNN
+TEST(DataTransform, GetDataFromTensorDNNL) {
+  auto place = paddle::platform::CPUPlace();
+  paddle::framework::Tensor in = paddle::framework::Tensor();
+  in.mutable_data<paddle::platform::bfloat16>(
+      paddle::framework::make_ddim({2, 3, 1, 2}), place);
+
+  void* in_data =
+      paddle::framework::GetDataFromTensor(in, dnnl::memory::data_type::bf16);
+  EXPECT_EQ(in_data, paddle::platform::to_void_cast(
+                         in.data<paddle::platform::bfloat16>()));
+}
+#endif
diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
index f479d92483c1c39a0b43e0d8c514237bf89bcc00..8188d5cde1b90436d040e8b9dcc1070ac85bf319 100644
--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -18,6 +18,7 @@
 #include <string>
 
 using float16 = paddle::platform::float16;
+using bfloat16 = paddle::platform::bfloat16;
 
 namespace paddle {
 namespace framework {
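Note on the new type these hunks register: the tests below compare the raw `.x` bit pattern of `platform::bfloat16` and assert `SizeOfType(BF16) == 2u`. A minimal sketch of the bfloat16 representation, assuming simple truncation (Paddle's actual `platform::bfloat16` may handle rounding and NaN payloads differently):

```cpp
#include <cstdint>
#include <cstring>

// bfloat16 keeps the sign, exponent, and top 7 mantissa bits of an IEEE
// binary32 float, i.e. its high 16 bits, so the type is exactly 2 bytes.
struct bfloat16_sketch {
  uint16_t x;  // raw bits; the same field name the tests compare against

  explicit bfloat16_sketch(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    x = static_cast<uint16_t>(bits >> 16);  // truncate low mantissa bits
  }

  explicit operator float() const {
    uint32_t bits = static_cast<uint32_t>(x) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
  }
};
```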
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index 2c4a7b4d02727437742b19cc6d51e209e4346d03..720e422e114835f367317d4ba265254856885c15 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -17,6 +17,8 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/enforce.h"
+
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
@@ -36,15 +38,16 @@ struct DataTypeTrait {
 #define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \
   callback(cpp_type, ::paddle::framework::proto::VarType::proto_type);
 
-#define _ForEachDataType_(callback)                                     \
-  _ForEachDataTypeHelper_(callback, float, FP32);                       \
-  _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \
-  _ForEachDataTypeHelper_(callback, double, FP64);                      \
-  _ForEachDataTypeHelper_(callback, int, INT32);                        \
-  _ForEachDataTypeHelper_(callback, int64_t, INT64);                    \
-  _ForEachDataTypeHelper_(callback, bool, BOOL);                        \
-  _ForEachDataTypeHelper_(callback, uint8_t, UINT8);                    \
-  _ForEachDataTypeHelper_(callback, int16_t, INT16);                    \
+#define _ForEachDataType_(callback)                                      \
+  _ForEachDataTypeHelper_(callback, float, FP32);                        \
+  _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16);  \
+  _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \
+  _ForEachDataTypeHelper_(callback, double, FP64);                       \
+  _ForEachDataTypeHelper_(callback, int, INT32);                         \
+  _ForEachDataTypeHelper_(callback, int64_t, INT64);                     \
+  _ForEachDataTypeHelper_(callback, bool, BOOL);                         \
+  _ForEachDataTypeHelper_(callback, uint8_t, UINT8);                     \
+  _ForEachDataTypeHelper_(callback, int16_t, INT16);                     \
   _ForEachDataTypeHelper_(callback, int8_t, INT8)
 
 #define _ForEachDataTypeSmall_(callback) \
diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc
index 2a380201f297f42dd82a6809bef9a72660066819..331596da33acc151810cd616ea6d5bdcae333b30 100644
--- a/paddle/fluid/framework/data_type_test.cc
+++ b/paddle/fluid/framework/data_type_test.cc
@@ -38,3 +38,25 @@ TEST(DataType, float16) {
   std::string type = "::paddle::platform::float16";
   EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
 }
+
+TEST(DataType, bfloat16) {
+  using paddle::framework::Tensor;
+  using paddle::platform::CPUPlace;
+  using paddle::platform::bfloat16;
+  namespace f = paddle::framework;
+  f::proto::VarType::Type dtype = f::proto::VarType::BF16;
+
+  Tensor tensor;
+  CPUPlace cpu;
+  tensor.mutable_data(cpu, dtype);
+
+  // test bf16 tensor
+  EXPECT_EQ(tensor.type(), f::ToDataType(typeid(bfloat16)));
+
+  // test bf16 size
+  EXPECT_EQ(f::SizeOfType(dtype), 2u);
+
+  // test debug info
+  std::string type = "::paddle::platform::bfloat16";
+  EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
+}
diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc
index 44542f05d9d5c92f58a84dc2be59782bae2ff3aa..3d56152c237695126d2eecb0c51ebd964a85a690 100644
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -77,6 +77,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
       framework::VisitDataType(dst_type,
                                CastDataType<platform::float16>(in, out, ctx));
       break;
+    case proto::VarType::BF16:
+      framework::VisitDataType(dst_type,
+                               CastDataType<platform::bfloat16>(in, out, ctx));
+      break;
     case proto::VarType::FP32:
       framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
       break;
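Note on the dispatch pattern the new `BF16` case extends: the outer switch fixes the source type, then `VisitDataType` instantiates the destination type, so every (src, dst) pair reduces to an elementwise cast. A simplified host-side sketch (names and signatures abbreviated; the assumed `apply<OutT>()` hook mirrors how Paddle's visitor is invoked, and the real `CastDataType` also carries the tensors and device context):

```cpp
#include <algorithm>
#include <cstddef>

// Simplified stand-in for CastDataType<InT>: the visitor picks OutT from the
// destination proto type and calls apply<OutT>(), which does the actual cast.
template <typename InT>
struct CastDataTypeSketch {
  const InT* in;
  size_t numel;

  template <typename OutT>
  void apply(OutT* out) const {
    std::transform(in, in + numel, out,
                   [](InT v) { return static_cast<OutT>(v); });
  }
};
```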
diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc
index bbebea9f13fd37469a0e9b7be9719aca128f5687..ea7a665bcbe02ff382f1b3bf04ce177a674483c9 100644
--- a/paddle/fluid/framework/data_type_transform_test.cc
+++ b/paddle/fluid/framework/data_type_transform_test.cc
@@ -24,6 +24,11 @@ TEST(DataTypeTransform, CPUTransform) {
       paddle::framework::DataLayout::kAnyLayout,
       paddle::framework::LibraryType::kPlain);
 
+  auto kernel_bf16 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::BF16, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
   auto kernel_fp32 = paddle::framework::OpKernelType(
       paddle::framework::proto::VarType::FP32, place,
       paddle::framework::DataLayout::kAnyLayout,
@@ -189,4 +194,120 @@ TEST(DataTypeTransform, CPUTransform) {
                 static_cast<paddle::platform::float16>(in_data_bool[i]).x);
     }
   }
+
+  // data type transform from/to bfloat16
+  {
+    paddle::framework::Tensor in;
+    paddle::framework::Tensor out;
+
+    paddle::platform::bfloat16* ptr =
+        in.mutable_data<paddle::platform::bfloat16>(
+            paddle::framework::make_ddim({2, 3}), place);
+    int data_number = 2 * 3;
+
+    for (int i = 0; i < data_number; ++i) {
+      ptr[i] = i;
+    }
+
+    // transform from bfloat16 to other data types
+    paddle::framework::TransDataType(kernel_bf16, kernel_fp32, in, &out);
+    float* out_data_float = out.data<float>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
+    }
+
+    paddle::framework::TransDataType(kernel_bf16, kernel_fp64, in, &out);
+    double* out_data_double = out.data<double>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
+    }
+
+    paddle::framework::TransDataType(kernel_bf16, kernel_int32, in, &out);
+    int* out_data_int = out.data<int>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
+    }
+
+    paddle::framework::TransDataType(kernel_bf16, kernel_int64, in, &out);
+    int64_t* out_data_int64 = out.data<int64_t>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
+    }
+
+    paddle::framework::TransDataType(kernel_bf16, kernel_bool, in, &out);
+    bool* out_data_bool = out.data<bool>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
+    }
+
+    // transform float to bfloat16
+    float* in_data_float =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_float[i] = i;
+    }
+
+    paddle::framework::TransDataType(kernel_fp32, kernel_bf16, in, &out);
+    ptr = out.data<paddle::platform::bfloat16>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::bfloat16>(in_data_float[i]).x);
+    }
+
+    // transform double to bfloat16
+    double* in_data_double =
+        in.mutable_data<double>(paddle::framework::make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_double[i] = i;
+    }
+
+    paddle::framework::TransDataType(kernel_fp64, kernel_bf16, in, &out);
+    ptr = out.data<paddle::platform::bfloat16>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::bfloat16>(in_data_double[i]).x);
+    }
+
+    // transform int to bfloat16
+    int* in_data_int =
+        in.mutable_data<int>(paddle::framework::make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_int[i] = i;
+    }
+
+    paddle::framework::TransDataType(kernel_int32, kernel_bf16, in, &out);
+    ptr = out.data<paddle::platform::bfloat16>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::bfloat16>(in_data_int[i]).x);
+    }
+
+    // transform int64 to bfloat16
+    int64_t* in_data_int64 =
+        in.mutable_data<int64_t>(paddle::framework::make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_int64[i] = i;
+    }
+
+    paddle::framework::TransDataType(kernel_int64, kernel_bf16, in, &out);
+    ptr = out.data<paddle::platform::bfloat16>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::bfloat16>(in_data_int64[i]).x);
+    }
+
+    // transform bool to bfloat16
+    bool* in_data_bool =
+        in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_bool[i] = i;
+    }
+
+    paddle::framework::TransDataType(kernel_bool, kernel_bf16, in, &out);
+    ptr = out.data<paddle::platform::bfloat16>();
+    for (int i = 0; i < data_number; ++i) {
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::bfloat16>(in_data_bool[i]).x);
+    }
+  }
 }
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index 956b099e883f9ea6d96db8716cb0fa693a3796d4..0ad84f5890acaf1c793000859ed3fbc7c1fc22d3 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -167,6 +167,8 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num,
 // more detail see: 180 page of
 // https://www.openmp.org/wp-content/uploads/OpenMP4.0.0.pdf
 #pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in)
+#pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \
+                                  omp_in)
 #endif
 
 template <typename T>
@@ -205,6 +207,21 @@ void CheckNanInf(
     PrintNanInf(value, numel, print_num, op_type, var_name);
   }
 }
+
+template <>
+void CheckNanInf<paddle::platform::bfloat16>(
+    const paddle::platform::bfloat16* value, const size_t numel, int print_num,
+    const std::string& op_type, const std::string& var_name) {
+  float sum = 0.0f;
+#pragma omp parallel for reduction(+ : sum)
+  for (size_t i = 0; i < numel; ++i) {
+    sum += static_cast<float>(value[i] - value[i]);
+  }
+
+  if (std::isnan(sum) || std::isinf(sum)) {
+    PrintNanInf(value, numel, print_num, op_type, var_name);
+  }
+}
 #endif
 
 template <>
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index 180b33d0cb72e2c4c9e6e8caff9f0ef5f1b04689..915589b3242b7d5675e630aca7310185fd109ec2 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -23,6 +23,7 @@ template <typename T>
 static ::DLDataType GetDLDataTypeCode() {
   ::DLDataType dtype;
   if (std::is_same<platform::float16, T>::value ||
+      std::is_same<platform::bfloat16, T>::value ||
       std::is_floating_point<T>::value) {
     dtype.code = kDLFloat;
   } else if (std::is_unsigned<T>::value) {
diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc
index 9bde9e20b19a0b14ce4489b91d9ab3d5273f7f9a..d51e97d98e902a87cd2a44d2019e93e8dfc30fc8 100644
--- a/paddle/fluid/framework/generator.cc
+++ b/paddle/fluid/framework/generator.cc
@@ -21,10 +21,46 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <memory>
 #include <utility>
+#include <deque>
+
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace framework {
 
+const std::shared_ptr<Generator>& GetDefaultCUDAGenerator(int64_t device_id) {
+#ifdef PADDLE_WITH_CUDA
+
+  static int64_t num_cuda_devices = -1;
+  static std::once_flag num_devices_init_flag;
+  static std::deque<std::once_flag> cuda_device_flags;
+  static std::vector<std::shared_ptr<Generator>> default_cuda_generators;
+
+  std::call_once(num_devices_init_flag, []() {
+    num_cuda_devices = paddle::platform::GetCUDADeviceCount();
+    cuda_device_flags.resize(num_cuda_devices);
+    default_cuda_generators.resize(num_cuda_devices);
+  });
+  if (device_id < 0) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "cuda device id should be greater than or equal to 0"));
+  }
+
+  std::call_once(cuda_device_flags[device_id], [device_id]() {
+    default_cuda_generators[device_id] =
+        std::make_shared<Generator>(GetRandomSeed(), device_id);
+    VLOG(4) << "initial seed: "
+            << default_cuda_generators[device_id]->GetCurrentSeed();
+  });
+  return default_cuda_generators[device_id];
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "GetDefaultCUDAGenerator is only supported in CUDA place"));
+#endif
+}
+
 const std::shared_ptr<Generator>& DefaultCPUGenerator() {
   static auto default_cpu_generator =
       std::make_shared<Generator>(GetRandomSeed());
@@ -103,6 +139,7 @@ uint64_t Generator::Seed() {
 void Generator::SetCurrentSeed(uint64_t seed) {
   std::lock_guard<std::mutex> lock(this->mu_);
   this->state_.current_seed = seed;
+  this->state_.thread_offset = 0;
   std::seed_seq seq({seed});
   this->engine_->seed(seq);
 }
@@ -123,6 +160,22 @@ uint64_t Generator::Random64() {
   return (*engine)();
 }
 
+std::pair<uint64_t, uint64_t> Generator::IncrementOffset(
+    uint64_t increment_offset) {
+#ifdef PADDLE_WITH_CUDA
+  std::lock_guard<std::mutex> lock(this->mu_);
+  uint64_t cur_offset = this->state_.thread_offset;
+  this->state_.thread_offset += increment_offset;
+  return std::make_pair(this->state_.current_seed, cur_offset);
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "IncrementOffset is only supported in CUDA place"));
+#endif
+}
+
 void Generator::SetIsInitPy(bool is_init_py) {
   this->is_init_py_ = is_init_py;
   VLOG(4) << "SetIsInitPy:" << this->is_init_py_;
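Note on the `(seed, offset)` bookkeeping added above: each kernel launch calls `IncrementOffset(n)` to reserve `n` draws from a counter-based (Philox) stream and receives the offset where its slice starts, so results are reproducible without storing per-thread RNG states. A host-side sketch of the idea (assumed simplification; the real `Generator` also locks a mutex and lives behind the per-device singleton shown above):

```cpp
#include <cstdint>
#include <utility>

struct OffsetCounterSketch {
  uint64_t seed = 0;
  uint64_t thread_offset = 0;

  // Returns (seed, old offset). The caller seeds its counter-based RNG with
  // `seed` and skips ahead by the returned offset, e.g. via curand_init,
  // so consecutive ops never reuse the same random numbers.
  std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t n) {
    uint64_t cur = thread_offset;
    thread_offset += n;
    return {seed, cur};
  }
};
```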
diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h
index 82b35f7ad550e770e8d10457ddf6cdf8e6fbd709..a279c2e4e1458293b6579b7b7cb2111e440e5d5e 100644
--- a/paddle/fluid/framework/generator.h
+++ b/paddle/fluid/framework/generator.h
@@ -38,6 +38,7 @@ static uint64_t GetRandomSeed() {
 struct GeneratorState {
   int64_t device = -1;
   uint64_t current_seed = 34342423252;
+  uint64_t thread_offset = 0;
   std::mt19937_64 cpu_engine;
 };
 
@@ -49,6 +50,7 @@ struct Generator {
     this->state_.cpu_engine = *engine;
     this->state_.device = -1;
     this->state_.current_seed = seed;
+    this->state_.thread_offset = 0;
     this->engine_ = engine;
     VLOG(4) << "initial seed: " << this->state_.current_seed
             << ", cpu engine: " << &this->state_.cpu_engine;
@@ -59,11 +61,25 @@ struct Generator {
     this->state_.cpu_engine = *engine;
     this->state_.device = -1;
     this->state_.current_seed = seed;
+    this->state_.thread_offset = 0;
     this->engine_ = engine;
     VLOG(4) << "initial seed: " << this->state_.current_seed
             << ", cpu engine: " << &this->state_.cpu_engine;
     this->is_init_py_ = true;  // TODO(zhiqiu): remove it in future
   }
+  Generator(uint64_t seed, uint64_t device_id) {
+    std::seed_seq seq({seed});
+    auto engine = std::make_shared<std::mt19937_64>(seq);
+    this->state_.cpu_engine = *engine;
+    this->state_.device = device_id;
+    this->state_.current_seed = seed;
+    this->state_.thread_offset = 0;
+    this->engine_ = engine;
+    VLOG(4) << "initial seed: " << this->state_.current_seed
+            << ", cpu engine: " << &this->state_.cpu_engine;
+    this->is_init_py_ = false;  // TODO(zhiqiu): remove it in future
+  }
+
   Generator(const Generator& other) = delete;
 
   // get random state
@@ -83,8 +99,11 @@ struct Generator {
 
   uint64_t Random64();
 
+  std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t increment_offset);
+
   void SetIsInitPy(bool);
   bool GetIsInitPy() const;
+  uint64_t get_device_id() { return this->state_.device; }
 
  private:
   GeneratorState state_;
@@ -105,5 +124,8 @@ std::shared_ptr<std::mt19937_64> OpDefaultCPUEngine();
 
 std::shared_ptr<std::mt19937_64> GetCPURandomEngine(uint64_t);
 
+const std::shared_ptr<Generator>& GetDefaultCUDAGenerator(
+    int64_t device_id = -1);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
index 40e01c75bb99157aedccd0692d7410b99393c009..198107ea082dc86d9e65a926bf9befe2fc4abfa4 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -615,6 +615,16 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
     GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out,
                               multihead_pattern);
 
+    // If weights or biases in qkv's fc are shared by multiple multihead_matmul
+    // patterns, we do not support this kind of fusion, and this pass will not
+    // take effect.
+    bool is_fc_params_shared =
+        mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 ||
+        mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 ||
+        eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1;
+    if (is_fc_params_shared) {
+      return;
+    }
     fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w,
                  mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b,
                  reshape2_0, reshape2_qkv_out, scale, scale_out);
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
index 6fbf880356c541e72cae6f3b03efe017042254ff..9eb8478515727cf04f9d16e9a38a8f4c3ec9c683 100644
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
@@ -90,32 +90,6 @@ void MemoryOptimizePass::CollectLifeCycle(
   }
 }
 
-// TODO(Superjomn) Make this a general help method.
-int DataTypeToSpace(framework::proto::VarType_Type type) {
-  switch (type) {
-    case framework::proto::VarType_Type_BOOL:
-      return sizeof(bool);
-    case framework::proto::VarType_Type_FP32:
-      return sizeof(float);
-    case framework::proto::VarType_Type_INT32:
-      return sizeof(int32_t);
-    case framework::proto::VarType_Type_INT64:
-      return sizeof(int64_t);
-    case framework::proto::VarType_Type_INT16:
-      return sizeof(int16_t);
-    case framework::proto::VarType_Type_FP16:
-      return sizeof(int16_t);
-    case framework::proto::VarType_Type_FP64:
-      return sizeof(double);
-    case framework::proto::VarType_Type_UINT8:
-      return sizeof(unsigned char);
-    case framework::proto::VarType_Type_INT8:
-      return sizeof(int8_t);
-    default:
-      PADDLE_THROW("Unknown data type");
-  }
-}
-
 void MemoryOptimizePass::CollectVarMemorySize(
     space_table_t* space_table) const {
   const int fake_batch_size = 1;
@@ -163,7 +137,7 @@ void MemoryOptimizePass::CollectVarMemorySize(
       int size = std::accumulate(shape.begin(), shape.end(), 1,
                                  std::multiplies<int>());
       (*space_table)[node->Var()->Name()] =
-          size * DataTypeToSpace(node->Var()->GetDataType());
+          size * paddle::framework::SizeOfType(node->Var()->GetDataType());
     }
   }
 }
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index da5d7411693c92eaa2066c7f76d56970f8939bc7..a58b510ecf16a4bb2e2be9f4c2946a550ea20d2d 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -73,7 +73,7 @@ class PD_INFER_DECL Tensor {
 
 class PD_INFER_DECL Predictor {
  public:
-  Predictor() = default;
+  Predictor() = delete;
   ~Predictor() {}
   // Use for clone
   explicit Predictor(std::unique_ptr<paddle::PaddlePredictor>&& pred)
diff --git a/paddle/fluid/inference/lite/test_engine.cc b/paddle/fluid/inference/lite/test_engine.cc
index 325c7ab2539f28f5145ee88a1bbf374f333348e1..d29bcb76be78f151dc606d9f335e9df9ed19b16b 100644
--- a/paddle/fluid/inference/lite/test_engine.cc
+++ b/paddle/fluid/inference/lite/test_engine.cc
@@ -14,15 +14,16 @@
 
 #include <gtest/gtest.h>
 
-#include "paddle/fluid/inference/lite/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/operators/lite/ut_helper.h"
 
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 
+#include "paddle/fluid/inference/lite/engine.h"
+#include "paddle/fluid/operators/lite/ut_helper.h"
+
 namespace paddle {
 namespace inference {
 namespace lite {
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index ac05b08b8f2a038234e7192f47a37b3ef3bcf461..6dd13d32e6e25f1657f351ff3a54562435b098f3 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -125,7 +125,7 @@ endfunction()
 if(NOT APPLE AND WITH_MKLML)
     # RNN1
     set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
-    download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
+    download_model_and_data(${RNN1_INSTALL_DIR} "rnn1/model.tar.gz" "rnn1/data.txt.tar.gz")
     inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc)
 
     # seq_pool1
@@ -210,7 +210,7 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
 # transformer, the dataset only works on batch_size=8 now
 set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer")
-download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz")
+download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz" "temp/transformer_data.txt.tar.gz")
 inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc
   EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
   ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8
@@ -219,7 +219,7 @@ inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_test
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR}/ocr.tar.gz)
-    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
+    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/ocr.tar.gz")
 endif()
 inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
@@ -235,7 +235,7 @@ set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysi
 # mobilenet with transpose op
 set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
 if (NOT EXISTS ${MOBILENET_INSTALL_DIR}/mobilenet.tar.gz)
-    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
+    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/mobilenet.tar.gz")
 endif()
 inference_analysis_api_test(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc)
@@ -363,9 +363,9 @@ if(WITH_MKLDNN)
     inference_analysis_api_test_build(${QUANT_IMG_CLASS_TEST_APP} ${QUANT_IMG_CLASS_TEST_APP_SRC})
 
     # MobileNetV1 FP32 vs. Quant INT8
-    # The FP32 model should already be downloaded for slim Quant unit tests
     set(QUANT2_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2")
     set(QUANT2_INT8_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2_int8")
+    download_quant_data(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz")
     download_quant_data(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz")
     inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH})
@@ -477,9 +477,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
     inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
   endif()
 
-  inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc
-     EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-     ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
+  # disable test_trt_dynamic_shape_ernie_ser_deser temporarily
+  #inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc
+  #   EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+  #   ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
 
 endif()
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
index e24706691ed834ac4f49d924162035ec565d24ea..d76799a679cbf27700c6d9af4f2e2e50c5e33e35 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
@@ -44,7 +44,7 @@ void zero_copy_run() {
   const int channels = 3;
   const int height = 318;
   const int width = 318;
-  float input[batch_size * channels * height * width] = {0};
+  float *input = new float[batch_size * channels * height * width]();
   int shape[4] = {batch_size, channels, height, width};
   int shape_size = 4;
@@ -65,6 +65,7 @@ void zero_copy_run() {
 
   PD_PredictorZeroCopyRun(config, inputs, in_size, &outputs, &out_size);
 
+  delete[] input;
   delete[] inputs;
   delete[] outputs;
 }
diff --git a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc
index 1faffacebcfdb173b96815a6ad223f06ea69c07f..c6a898dc2f315a67e3693abd73f481b08cac414a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc
@@ -112,7 +112,11 @@ TEST(Analyzer_resnet50, compare_determine) {
 TEST(Analyzer_resnet50, save_optim_model) {
   AnalysisConfig cfg;
   std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model";
+#ifdef _WIN32
+  _mkdir(optimModelPath.c_str());
+#else
   mkdir(optimModelPath.c_str(), 0777);
+#endif
   SetConfig(&cfg);
   SaveOptimModel(&cfg, optimModelPath);
 }
diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index 36e07d5f55600dc7aa96227289f707fb19f92d56..2a862b1395c222cf6d23216c9d4cf9196ffb519c 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -123,7 +123,7 @@ void profile(bool memory_load = false) {
   size_t size = GetSize(output[0]);
   PADDLE_ENFORCE_GT(size, 0);
   int64_t *result = static_cast<int64_t *>(output[0].data.data());
-  for (size_t i = 0; i < std::min(11UL, size); i++) {
+  for (size_t i = 0; i < std::min<size_t>(11, size); i++) {
     EXPECT_EQ(result[i], chinese_ner_result_data[i]);
   }
 }
diff --git a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
index c5610961d65832b455d56c3d5dcc87d9a375f6b9..9f3a389ea344e7e827c5864dff70a1b0eec10f08 100644
--- a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
+++ b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
@@ -23,7 +23,7 @@ from PIL import Image
 import math
 from paddle.dataset.common import download
 import tarfile
-import StringIO
+from six.moves import StringIO
 import argparse
 
 random.seed(0)
@@ -152,7 +152,7 @@ def convert_Imagenet_tar2bin(tar_file, output_file):
 
     idx = 0
     for imagedata in dataset.values():
-        img = Image.open(StringIO.StringIO(imagedata))
+        img = Image.open(StringIO(imagedata))
         img = process_image(img)
         np_img = np.array(img)
         ofs.write(np_img.astype('float32').tobytes())
diff --git a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py
index 8a098aa1eb4875b9cf016ea649f90c5beb511d79..84c4eb7e5e87ee36692e25c70a93cbc32082db45 100644
--- a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py
+++ b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py
@@ -19,7 +19,7 @@ import os
 import sys
 from paddle.dataset.common import download
 import tarfile
-import StringIO
+from six.moves import StringIO
 import hashlib
 import tarfile
 import argparse
@@ -191,7 +191,7 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path):
             gt_labels[name_prefix] = tar.extractfile(tarInfo).read()
 
     for line_idx, name_prefix in enumerate(lines):
-        im = Image.open(StringIO.StringIO(images[name_prefix]))
+        im = Image.open(StringIO(images[name_prefix]))
         if im.mode == 'L':
             im = im.convert('RGB')
         im_width, im_height = im.size
diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake
index 8bc10f2147fa29102b242ce22e78a88453d6cee4..9bde2a99db1b75a454b005eec2d237294c7aa815 100644
--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
@@ -25,7 +25,8 @@ endfunction()
 
 function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
   message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
-  string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
+  string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME})
+  string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME})
   set(EXTERNAL_PROJECT_NAME "extern_inference_download_${FILENAME_EX}")
   set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}")
   ExternalProject_Add(
@@ -38,7 +39,7 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
       DOWNLOAD_NO_PROGRESS  1
       CONFIGURE_COMMAND     ""
       BUILD_COMMAND         ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR}
-                            ${CMAKE_COMMAND} -E tar xzf ${FILENAME}
+                            ${CMAKE_COMMAND} -E tar xzf ${DOWNLOAD_NAME}
       UPDATE_COMMAND        ""
       INSTALL_COMMAND       ""
   )
diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu
index 7aaaa0002c5ab31af72c75e69f5a283c09633ba4..58b56bdcf5614ed9183ce3bf11c1767f92650d20 100644
--- a/paddle/fluid/operators/affine_grid_op.cu
+++ b/paddle/fluid/operators/affine_grid_op.cu
@@ -62,11 +62,11 @@ __global__ void affine_grid_kernel(const int count, int n, int out_h, int out_w,
     int theta_offset = n * 6;  // 2 * 3;
     // affine from (h_coor, w_coor) to (x, y)
-    output[index * 2] = theta[theta_offset] * h_coor +
-                        theta[theta_offset + 1] * w_coor +
+    output[index * 2] = theta[theta_offset] * w_coor +
+                        theta[theta_offset + 1] * h_coor +
                         theta[theta_offset + 2];
-    output[index * 2 + 1] = theta[theta_offset + 3] * h_coor +
-                            theta[theta_offset + 4] * w_coor +
+    output[index * 2 + 1] = theta[theta_offset + 3] * w_coor +
+                            theta[theta_offset + 4] * h_coor +
                             theta[theta_offset + 5];
   }
 }
@@ -86,13 +86,13 @@ __global__ void affine_grid_grad_kernel(const int count, int n, int out_h,
     int theta_offset = n * 6;  // 2 * 3;
     T out_grad_x = out_grad[index * 2];
-    platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * h_coor);
-    platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * w_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * h_coor);
     platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x);
 
     T out_grad_y = out_grad[index * 2 + 1];
-    platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor);
-    platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * w_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * h_coor);
     platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y);
   }
 }
diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h
index 69365357084b660b7c2f90149fe250854ea6a014..c296ddcfbef703e8484b6ea0b7f96f037e415186 100644
--- a/paddle/fluid/operators/arg_min_max_op_base.h
+++ b/paddle/fluid/operators/arg_min_max_op_base.h
@@ -166,10 +166,22 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
                       platform::errors::InvalidArgument(
                           "'axis'(%d) must be less than Rank(X)(%d).", axis,
                           x_dims.size()));
+
+    const int& dtype = ctx->Attrs().Get<int>("dtype");
+    PADDLE_ENFORCE_EQ(
+        (dtype < 0 || dtype == 2 || dtype == 3), true,
+        platform::errors::InvalidArgument(
+            "The attribute of dtype in argmin/argmax must be [%s] or [%s], but "
+            "received [%s]",
+            paddle::framework::DataTypeToString(
+                framework::proto::VarType::INT32),
+            paddle::framework::DataTypeToString(
+                framework::proto::VarType::INT64),
+            paddle::framework::DataTypeToString(
+                static_cast<framework::proto::VarType::Type>(dtype))));
+
     auto x_rank = x_dims.size();
     if (axis < 0) axis += x_rank;
     if (ctx->IsRuntime()) {
-      const int& dtype = ctx->Attrs().Get<int>("dtype");
       if (dtype == framework::proto::VarType::INT32) {
         int64_t all_element_num = 0;
         if (flatten) {
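Note on the magic numbers in the new argmin/argmax check: `dtype == 2 || dtype == 3` compares the raw attribute against the `VarType::Type` enum values that, as of this revision of framework.proto, correspond to `INT32` and `INT64`; a negative value means "not set, use the default". A compile-time cross-check one could add (illustrative, not part of the patch):

```cpp
#include "paddle/fluid/framework/framework.pb.h"

// Document the coupling between the raw attribute check and the proto enum.
static_assert(paddle::framework::proto::VarType::INT32 == 2,
              "argmin/argmax dtype check relies on INT32 == 2");
static_assert(paddle::framework::proto::VarType::INT64 == 3,
              "argmin/argmax dtype check relies on INT64 == 3");
```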
diff --git a/paddle/fluid/operators/bernoulli_op.cu b/paddle/fluid/operators/bernoulli_op.cu
index f665d2dd0e991847de2ad35bf6b18741fb3a6e26..6565f5a9a2176972e9e5085c6646097e8349f259 100644
--- a/paddle/fluid/operators/bernoulli_op.cu
+++ b/paddle/fluid/operators/bernoulli_op.cu
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
 
-#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/bernoulli_op.h"
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
index 7081490fd1bf0e26cb8aa90d69a76a5476cef044..cc807f193ed835cfbf04dfcefad7ffb24e8ab286 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
@@ -37,41 +37,42 @@ class CudnnLSTMOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasOutput("LastC"), "Output", "LastC", "CudnnLSTM");
 
     auto in_dims = ctx->GetInputDim("Input");
-    auto init_dims = ctx->GetInputDim("InitH");
+    auto init_h_dims = ctx->GetInputDim("InitH");
+    auto init_c_dims = ctx->GetInputDim("InitC");
+
     PADDLE_ENFORCE_EQ(in_dims.size(), 3,
                       platform::errors::InvalidArgument(
                           "The rank of Input in CudnnLSTM must be 3. But "
                           "received Input's rank is %d.",
                           in_dims.size()));
-    PADDLE_ENFORCE_EQ(init_dims.size(), 3,
+    PADDLE_ENFORCE_EQ(init_h_dims.size(), 3,
                       platform::errors::InvalidArgument(
                           "The rank of InitH in CudnnLSTM must be 3. But "
                           "received InitH's rank is %d.",
-                          init_dims.size()));
+                          init_h_dims.size()));
 
-    PADDLE_ENFORCE_EQ(in_dims[1], init_dims[1],
-                      platform::errors::InvalidArgument(
-                          "The in_dims[1] (Input dims) and init_dims[1] (InitH "
-                          "dims) should be equal. But "
-                          "received in_dims[1] is %d and init_dims[1] is %d.",
-                          in_dims[1], init_dims[1]));
-    PADDLE_ENFORCE_EQ(in_dims[2], init_dims[2],
+    PADDLE_ENFORCE_EQ(
+        in_dims[1], init_h_dims[1],
+        platform::errors::InvalidArgument(
+            "The in_dims[1] (Input dims) and init_h_dims[1] (InitH "
+            "dims) should be equal. But "
+            "received in_dims[1] is %d and init_h_dims[1] is %d.",
+            in_dims[1], init_h_dims[1]));
+
+    PADDLE_ENFORCE_EQ(init_c_dims, init_h_dims,
                       platform::errors::InvalidArgument(
-                          "The in_dims[2] (Input dims) and init_dims[2] (InitH "
-                          "dims) should be equal. But "
-                          "received in_dims[2] is %d and init_dims[2] is %d.",
-                          in_dims[2], init_dims[2]));
+                          "The InitC dims and InitH "
+                          "dims should be equal. But "
+                          "received init_c_dims is %d and init_h_dims is %d.",
+                          init_c_dims, init_h_dims));
 
     auto out_dims = in_dims;
     auto hidden_size = ctx->Attrs().Get<int>("hidden_size");
     bool is_bidirec = ctx->Attrs().Get<bool>("is_bidirec");
     out_dims[2] = is_bidirec ? hidden_size * 2 : hidden_size;
-
-    auto last_dims = init_dims;
-    last_dims[0] = is_bidirec ? last_dims[0] * 2 : last_dims[0];
     ctx->SetOutputDim("Out", out_dims);
-    ctx->SetOutputDim("LastH", last_dims);
-    ctx->SetOutputDim("LastC", last_dims);
+    ctx->SetOutputDim("LastH", init_h_dims);
+    ctx->SetOutputDim("LastC", init_c_dims);
   }
 
  protected:
@@ -95,7 +96,7 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
              "different batch)"
              "batch_size is the instance number of this batch"
             "input_size is the hidden size of the input."
-             "input_hidden_size and the hidden_size in the next may not be same");
+             "input_size and the hidden_size in the next may not be same");
     AddInput("InitH",
              "(Tensor) the initial hidden state of the LSTM"
              "input. This is a tensor with shape (num_layers x batch_size x "
@@ -154,6 +155,13 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(1);
     AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
     AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(0);
+    AddAttr<std::vector<int>>("sequence_length",
+                              "(vector<int>) When the input data is padding, "
+                              "set this parameter. This parameter represents "
+                              "the variable sequence lengths in a batch. "
+                              "The size of the vector has to equal the "
+                              "batch_size.")
+        .SetDefault({});
   AddComment(R"DOC(
 CUDNN LSTM implementation
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index 37e5e518ea2af9bb437775c8fa7e86816bb1d8ae..f60cd41d9a218c444254d268eb43abfb97db43e6 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/cudnn_desc.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
 
 namespace paddle {
 namespace operators {
@@ -55,50 +56,96 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     int num_layers = ctx.Attr<int>("num_layers");
     bool is_test = ctx.Attr<bool>("is_test");
     int seed = ctx.Attr<int>("seed");
+    auto sequence_length = ctx.Attr<std::vector<int>>("sequence_length");
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
 
-    CudnnRNNCache *cudnn_rnn_cache = new CudnnRNNCache();
+    int seq_length = x->dims()[0];
+    int batch_size = x->dims()[1];
+    int input_size = x->dims()[2];
+    int weight_numel = w->numel();
+    bool state_initialized = state_out->IsInitialized() ? true : false;
 
-    auto input_w_numel = w->numel();
-    auto seq_len = x->dims()[0];
-    auto batch_size = x->dims()[1];
-    auto input_dim = x->dims()[2];
+    size_t workspace_size;
     size_t reserve_size;
-    bool state_initialized = state_out->IsInitialized() ? true : false;
-    cudnnDataType_t cudnn_type = platform::ToCudnnDataType(
-        framework::ToDataType(std::type_index(typeid(T))));
-    cudnn_rnn_cache->init(handle, ctx.GetPlace(), seq_len, batch_size,
-                          input_dim, hidden_size, num_layers, dropout_prob,
-                          is_bidirec, seed, input_w_numel, &reserve_size,
-                          state_out, state_initialized, cudnn_type);
+
+    platform::ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size,
+                                num_layers, dropout_prob, seed, weight_numel,
+                                state_initialized, is_bidirec);
+    rnn.Create(handle, ctx.GetPlace(), sequence_length, &workspace_size,
+               &reserve_size, state_out);
+
+    framework::Tensor workspace_data_;
+    workspace_data_.Resize({static_cast<int64_t>(workspace_size)});
+    workspace_data_.mutable_data<uint8_t>(ctx.GetPlace());
 
     auto *reserve_data = reserve->mutable_data<uint8_t>(
         {static_cast<int64_t>(reserve_size)}, ctx.GetPlace());
 
     if (is_test) {
-      // for inference
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
-          handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
-          x_data, cudnn_rnn_cache->hx_desc_, init_h_data,
-          cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->w_desc_,
-          w_data, cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->hy_desc_,
-          last_h_data, cudnn_rnn_cache->cy_desc_, last_c_data,
-          cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
-          cudnn_rnn_cache->workspace_size_));
+      if (sequence_length.empty()) {
+        // for inference
+        // This interface is used when the input/output is unpadded.
+        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
+            handle, rnn.rnn_desc(), seq_length, rnn.x_desc(), x_data,
+            rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data,
+            rnn.w_desc(), w_data, rnn.y_desc(), out_data, rnn.hy_desc(),
+            last_h_data, rnn.cy_desc(), last_c_data,
+            workspace_data_.data<uint8_t>(), workspace_size));
+      } else {
+#if CUDNN_VERSION >= 7201
+        // for inference
+        // This interface is used when the input/output is padded.
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnRNNForwardInferenceEx(
+                handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.hx_desc(),
+                init_h_data, rnn.cx_desc(), init_c_data, rnn.w_desc(), w_data,
+                rnn.y_seq_desc(), out_data, rnn.hy_desc(), last_h_data,
+                rnn.cy_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, nullptr, nullptr,
+                workspace_data_.data<uint8_t>(), workspace_size));
+#else
+        PADDLE_ENFORCE_NOT_NULL(
+            nullptr, platform::errors::Unavailable(
+                         "The padded input is supported by "
+                         "cudnnRNNForwardInferenceEx, but it only works when "
+                         "the version of cudnn is larger than 7.2.1"));
+#endif
+      }
     } else {
-      // for train
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining(
-          handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
-          x_data, cudnn_rnn_cache->hx_desc_, init_h_data,
-          cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->w_desc_,
-          w_data, cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->hy_desc_,
-          last_h_data, cudnn_rnn_cache->cy_desc_, last_c_data,
-          cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
-          cudnn_rnn_cache->workspace_size_, reserve_data, reserve_size));
+      if (sequence_length.empty()) {
+        // for train
+        // This interface is used when the input/output is unpadded.
+        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining(
+            handle, rnn.rnn_desc(), seq_length, rnn.x_desc(), x_data,
+            rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data,
+            rnn.w_desc(), w_data, rnn.y_desc(), out_data, rnn.hy_desc(),
+            last_h_data, rnn.cy_desc(), last_c_data,
+            workspace_data_.data<uint8_t>(), workspace_size, reserve_data,
+            reserve_size));
+      } else {
+#if CUDNN_VERSION >= 7201
+        // for train
+        // This interface is used when the input/output is padded.
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnRNNForwardTrainingEx(
+                handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.hx_desc(),
+                init_h_data, rnn.cx_desc(), init_c_data, rnn.w_desc(), w_data,
+                rnn.y_seq_desc(), out_data, rnn.hy_desc(), last_h_data,
+                rnn.cy_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, nullptr, nullptr,
+                workspace_data_.data<uint8_t>(), workspace_size, reserve_data,
+                reserve_size));
+#else
+        PADDLE_ENFORCE_NOT_NULL(
+            nullptr, platform::errors::Unavailable(
+                         "The padded input is supported by "
+                         "cudnnRNNForwardTrainingEx, but it only works when "
+                         "the version of cudnn is larger than 7.2.1"));
+#endif
+      }
     }
-    delete cudnn_rnn_cache;
   }
 };
@@ -156,44 +203,74 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     int hidden_size = ctx.Attr<int>("hidden_size");
     int num_layers = ctx.Attr<int>("num_layers");
     int seed = ctx.Attr<int>("seed");
+    auto sequence_length = ctx.Attr<std::vector<int>>("sequence_length");
 
-    CudnnRNNCache *cudnn_rnn_cache = new CudnnRNNCache();
+    int seq_length = input_dims[0];
+    int batch_size = input->dims()[1];
+    int input_size = input->dims()[2];
+    int weight_numel = weight->numel();
 
-    auto input_w_numel = weight->numel();
-    auto seq_len = input_dims[0];
-    auto batch_size = input->dims()[1];
-    auto input_dim = input->dims()[2];
+    size_t workspace_size;
     size_t reserve_size;
-    cudnnDataType_t cudnn_type = platform::ToCudnnDataType(
-        framework::ToDataType(std::type_index(typeid(T))));
-    cudnn_rnn_cache->init(handle, ctx.GetPlace(), seq_len, batch_size,
-                          input_dim, hidden_size, num_layers, dropout_prob,
-                          is_bidirec, seed, input_w_numel, &reserve_size,
-                          const_cast<Tensor *>(state_out), true, cudnn_type);
-
-    auto work_data = cudnn_rnn_cache->workspace_data_.data<uint8_t>();
+
+    platform::ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size,
+                                num_layers, dropout_prob, seed, weight_numel,
+                                true, is_bidirec);
+
+    rnn.Create(handle, ctx.GetPlace(), sequence_length, &workspace_size,
+               &reserve_size, const_cast<Tensor *>(state_out));
+
+    framework::Tensor workspace_data_;
+    workspace_data_.Resize({static_cast<int64_t>(workspace_size)});
+    workspace_data_.mutable_data<uint8_t>(ctx.GetPlace());
     const uint8_t *reserve_data = reserve->data<uint8_t>();
 
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData(
-        handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->y_desc_,
-        out_data, cudnn_rnn_cache->y_desc_, out_grad_data,
-        cudnn_rnn_cache->hy_desc_, last_h_grad_data, cudnn_rnn_cache->cy_desc_,
-        last_c_grad_data, cudnn_rnn_cache->w_desc_, weight_data,
-        cudnn_rnn_cache->hx_desc_, init_h_data, cudnn_rnn_cache->cx_desc_,
-        init_c_data, cudnn_rnn_cache->x_desc_, in_grad_data,
-        cudnn_rnn_cache->hx_desc_, init_h_grad_data, cudnn_rnn_cache->cx_desc_,
-        init_c_grad_data, work_data, cudnn_rnn_cache->workspace_size_,
-        const_cast<uint8_t *>(reserve_data), reserve_size));
-
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights(
-        handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
-        input->data<T>(), cudnn_rnn_cache->hx_desc_, init_h->data<T>(),
-        cudnn_rnn_cache->y_desc_, out->data<T>(),
-        cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
-        cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->w_desc_,
-        weight_grad->data<T>(), const_cast<uint8_t *>(reserve_data),
-        reserve_size));
-    delete cudnn_rnn_cache;
+    if (sequence_length.empty()) {
+      // This interface is used when the input/output is unpadded.
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData(
+          handle, rnn.rnn_desc(), seq_length, rnn.y_desc(), out_data,
+          rnn.y_desc(), out_grad_data, rnn.hy_desc(), last_h_grad_data,
+          rnn.cy_desc(), last_c_grad_data, rnn.w_desc(), weight_data,
+          rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data, rnn.x_desc(),
+          in_grad_data, rnn.hx_desc(), init_h_grad_data, rnn.cx_desc(),
+          init_c_grad_data, workspace_data_.data<uint8_t>(), workspace_size,
+          const_cast<uint8_t *>(reserve_data), reserve_size));
+
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights(
+          handle, rnn.rnn_desc(), seq_length, rnn.x_desc(), input->data<T>(),
+          rnn.hx_desc(), init_h->data<T>(), rnn.y_desc(), out->data<T>(),
+          workspace_data_.data<uint8_t>(), workspace_size, rnn.w_desc(),
+          weight_grad->data<T>(), const_cast<uint8_t *>(reserve_data),
+          reserve_size));
+    } else {
+#if CUDNN_VERSION >= 7201
+      // for train
+      // This interface is used when the input/output is padded.
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx(
+          handle, rnn.rnn_desc(), rnn.y_seq_desc(), out_data, rnn.y_seq_desc(),
+          out_grad_data, nullptr, nullptr, rnn.hy_desc(), last_h_grad_data,
+          rnn.cy_desc(), last_c_grad_data, rnn.w_desc(), weight_data,
+          rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data,
+          rnn.x_seq_desc(), in_grad_data, rnn.hx_desc(), init_h_grad_data,
+          rnn.cx_desc(), init_c_grad_data, nullptr, nullptr,
+          workspace_data_.data<uint8_t>(), workspace_size,
+          const_cast<uint8_t *>(reserve_data), reserve_size));
+
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx(
+          handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data<T>(),
+          rnn.hx_desc(), init_h->data<T>(), rnn.y_seq_desc(), out->data<T>(),
+          workspace_data_.data<uint8_t>(), workspace_size, rnn.w_desc(),
+          weight_grad->data<T>(), const_cast<uint8_t *>(reserve_data),
+          reserve_size));
+#else
+      PADDLE_ENFORCE_NOT_NULL(
+          nullptr,
+          platform::errors::Unavailable(
+              "The padded input of rnn is supported by cudnnRNNBackwardDataEx, "
+              "cudnnRNNBackwardWeightsEx, but it only works when the version "
+              "of cudnn is larger than 7.2.1"));
+#endif
+    }
   }
 };
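Note on the `CUDNN_VERSION >= 7201` guards used throughout the LSTM kernels: the `cudnnRNNForward*Ex` / `cudnnRNNBackward*Ex` entry points that accept padded, variable-length batches only exist from cuDNN 7.2.1 on, and cudnn.h encodes the version as a single integer, so 7.2.1 compares as 7201. A minimal illustration of the gate (not the PR's exact code):

```cpp
#include <cudnn.h>

// cudnn.h defines:
//   CUDNN_VERSION == CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL
// so the padded-sequence branch is compiled only against cuDNN >= 7.2.1.
#if CUDNN_VERSION >= 7201  // 7 * 1000 + 2 * 100 + 1
constexpr bool kPaddedRNNSupported = true;   // *Ex API family is available
#else
constexpr bool kPaddedRNNSupported = false;  // fall back and report an error
#endif
```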
diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc
index c9c42e0938d51991c53b74ac6ad59c350f4a3ced..de77121ee3990366771723e3c43e53362c832ef7 100644
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -62,6 +62,34 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
       gpu_dev_ctx.Wait();
 #else
       PADDLE_THROW("Unexpected branch");
+#endif
+      return true;
+    } else if (platform::is_xpu_place(place)) {
+#ifdef PADDLE_WITH_XPU
+      auto& xpu_dev_ctx =
+          static_cast<const platform::XPUDeviceContext&>(dev_ctx);
+      platform::CPUPlace cpu;
+      char* p = reinterpret_cast<char*>(dest);
+      while (total_written < length) {
+        if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
+          return false;
+        }
+
+        if (total_written + size_to_write > length) {
+          size_to_write = length - total_written;
+        }
+
+        memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place),
+                     reinterpret_cast<void*>(p), cpu, data, size_to_write);
+        p += size_to_write;
+        total_written += size_to_write;
+        input->Skip(size_to_write);
+      }
+      xpu_dev_ctx.Wait();
+#else
+      PADDLE_ENFORCE_NOT_NULL(
+          nullptr,
+          platform::errors::Unimplemented(
+              "Not supported XPU, please compile with option WITH_XPU=ON."));
 #endif
       return true;
     }
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index 4d5e4c4f600314d307125f9b2031026b6aa94f10..49ad67bbca353acc4a79c9e8912d7ae5a70c0021 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -96,6 +96,42 @@ __global__ void RandomGeneratorWithSeed(const size_t n, const int* seed,
   }
 }
 
+template <typename T, typename MaskType>
+__global__ void RandomGeneratorWithGenerator(const size_t n, uint64_t seed,
+                                             const float dropout_prob,
+                                             const T* src, MaskType* mask_data,
+                                             T* dst, bool is_upscale_in_train,
+                                             uint64_t increment) {
+  curandStatePhilox4_32_10_t state;
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int step_size = 0;
+
+  MaskType mask;
+  T dest;
+  for (; idx < n; idx += blockDim.x * gridDim.x) {
+    T s = src[idx];
+    if (step_size == 0) {
+      curand_init(seed, idx, increment, &state);
+      step_size = blockDim.x * gridDim.x;
+    } else {
+      curand_init(seed, idx, increment, &state);
+    }
+    if (curand_uniform(&state) < dropout_prob) {
+      mask = 0;
+      dest = 0;
+    } else {
+      mask = 1;
+      if (is_upscale_in_train) {
+        dest = s / static_cast<T>(1.0f - dropout_prob);
+      } else {
+        dest = s;
+      }
+    }
+    mask_data[idx] = mask;
+    dst[idx] = dest;
+  }
+}
+
 // It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
@@ -150,6 +186,17 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
           context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
     }
 
+    int device_id = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace())
+                        .GetDeviceId();
+    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
+    if (gen_cuda->GetIsInitPy() && (!context.Attr<bool>("fix_seed"))) {
+      auto seed_offset = gen_cuda->IncrementOffset(1);
+      RandomGeneratorWithGenerator<T, uint8_t><<<grid, threads, 0, stream>>>(
+          size, seed_offset.first, dropout_prob, x_data, mask_data, y_data,
+          upscale_in_train, seed_offset.second);
+      return;
+    }
+
     RandomGenerator<T, uint8_t><<<grid, threads, 0, stream>>>(
         size, seed_data, dropout_prob, x_data, mask_data, y_data,
         upscale_in_train);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
index 47bd6af0b95ace2b9b753e38cfc5f191bc1bb942..87e940e2ed6319c4f2957cd846735adb210cd23d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
@@ -31,6 +31,15 @@ struct ModFunctor {
   }
 };
 
+template <typename T>
+struct InverseModFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    T res = b % a;
+    if ((res != 0) && ((res < 0) != (a < 0))) res += a;
+    return res;
+  }
+};
+
 template <typename T>
 struct ModFunctorFP {
   inline HOSTDEVICE T operator()(T a, T b) const {
@@ -40,13 +49,29 @@ struct ModFunctorFP {
   }
 };
 
+template <typename T>
+struct InverseModFunctorFP {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    T res = fmod(b, a);
+    if ((res != 0) && ((a < 0) != (res < 0))) res += a;
+    return res;
+  }
+};
+
 template <typename DeviceContext, typename T>
 void elementwise_mod(const framework::ExecutionContext &ctx,
                      const framework::Tensor *x, const framework::Tensor *y,
                      framework::Tensor *z) {
   int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<ModFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                        ModFunctor<T>(), z);
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  if (x_dims.size() >= y_dims.size()) {
+    ElementwiseComputeEx<ModFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          ModFunctor<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseModFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseModFunctor<T>(), z);
+  }
 }
 
 template <typename DeviceContext, typename T>
@@ -54,8 +79,15 @@ void elementwise_mod_fp(const framework::ExecutionContext &ctx,
                         const framework::Tensor *x, const framework::Tensor *y,
                         framework::Tensor *z) {
   int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<ModFunctorFP<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          ModFunctorFP<T>(), z);
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  if (x_dims.size() >= y_dims.size()) {
+    ElementwiseComputeEx<ModFunctorFP<T>, DeviceContext, T>(
+        ctx, x, y, axis, ModFunctorFP<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseModFunctorFP<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseModFunctorFP<T>(), z);
+  }
 }
 
 template <typename DeviceContext, typename T>
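Note on the sign adjustment in the mod functors: C++'s built-in `%` and `fmod` truncate toward zero, so for mixed signs the remainder takes the dividend's sign, while these functors shift the result into the divisor's sign (Python-style floored modulo). A small worked example:

```cpp
// Floored vs. truncated remainder, the semantics the functors implement:
// -7 % 3 == -1 in C++ (truncated), but floored mod gives -7 mod 3 == 2.
// Adding the divisor whenever the raw remainder and the divisor have
// different signs converts the former into the latter.
int floored_mod(int a, int b) {
  int res = a % b;                                 // truncated remainder
  if (res != 0 && (res < 0) != (b < 0)) res += b;  // shift to divisor's sign
  return res;
}

// usage: floored_mod(-7, 3) == 2, floored_mod(7, -3) == -2
```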
diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu
index c144481f8dedc9317f7657a22ce82e56022d5b89..69c8b60040651179784cd6b77c31c66e892231be 100644
--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
@@ -24,15 +25,20 @@ template <typename T>
 struct GaussianGenerator {
   T mean_, std_;
   unsigned int seed_;
+  unsigned int offset_ = 0;
 
   __host__ __device__ GaussianGenerator(T mean, T std, int seed)
       : mean_(mean), std_(std), seed_(seed) {}
 
+  __host__ __device__ GaussianGenerator(T mean, T std, int seed, int offset)
+      : mean_(mean), std_(std), seed_(seed), offset_(offset) {}
+
   __host__ __device__ T operator()(const unsigned int n) const {
     thrust::minstd_rand rng;
     rng.seed(seed_);
     thrust::normal_distribution<T> dist(mean_, std_);
-    rng.discard(n);
+    unsigned int new_n = n + offset_;
+    rng.discard(new_n);
     return dist(rng);
   }
 };
@@ -43,9 +49,11 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto* tensor = context.Output<framework::Tensor>("Out");
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    bool seed_flag = false;
     if (seed == 0) {
       std::random_device rd;
       seed = rd();
+      seed_flag = true;
     }
     T mean = static_cast<T>(context.Attr<float>("mean"));
     T std = static_cast<T>(context.Attr<float>("std"));
@@ -56,9 +64,23 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
     T* data = tensor->mutable_data<T>(context.GetPlace());
 
     int64_t size = tensor->numel();
-    thrust::transform(index_sequence_begin, index_sequence_begin + size,
-                      thrust::device_ptr<T>(data),
-                      GaussianGenerator<T>(mean, std, seed));
+
+    int device_id =
+        BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId();
+    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
+
+    if (gen_cuda->GetIsInitPy() && seed_flag) {
+      auto seed_offset = gen_cuda->IncrementOffset(1);
+      int gen_offset = size * seed_offset.second;
+      thrust::transform(
+          index_sequence_begin, index_sequence_begin + size,
+          thrust::device_ptr<T>(data),
+          GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset));
+    } else {
+      thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                        thrust::device_ptr<T>(data),
+                        GaussianGenerator<T>(mean, std, seed));
+    }
   }
 };
 
@@ -69,17 +91,33 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
     auto* tensor = context.Output<framework::Tensor>("Out");
     T* data = tensor->mutable_data<T>(context.GetPlace());
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    bool seed_flag = false;
     if (seed == 0) {
       std::random_device rd;
       seed = rd();
+      seed_flag = true;
     }
     T mean = static_cast<T>(context.Attr<float>("mean"));
     T std = static_cast<T>(context.Attr<float>("std"));
     thrust::counting_iterator<unsigned int> index_sequence_begin(0);
     int64_t size = tensor->numel();
-    thrust::transform(index_sequence_begin, index_sequence_begin + size,
-                      thrust::device_ptr<T>(data),
-                      GaussianGenerator<T>(mean, std, seed));
+
+    int device_id =
+        BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId();
+    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
+
+    if (gen_cuda->GetIsInitPy() && seed_flag) {
+      auto seed_offset = gen_cuda->IncrementOffset(1);
+      int gen_offset = size * seed_offset.second;
+      thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                        thrust::device_ptr<T>(data),
+                        GaussianGenerator<T>(mean, std, seed_offset.first,
+                                             gen_offset));
+    } else {
+      thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                        thrust::device_ptr<T>(data),
+                        GaussianGenerator<T>(mean, std, seed));
+    }
   }
 };
 }  // namespace operators
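Note on the `discard`-based indexing above: because `GaussianGenerator` re-seeds a fresh engine per element and skips ahead by `index + offset`, element `i` of call `k` is a pure function of `(seed, offset_k, i)`, which is what makes runs reproducible; the offset grows by `size * calls` so consecutive ops do not reuse draws. A host-side analogue (assumed simplification, mirroring the same approximation the kernel makes):

```cpp
#include <random>

// Deterministically produce the (i + offset)-th normal draw for a given seed.
float nth_normal(unsigned seed, unsigned offset, unsigned i,
                 float mean, float stddev) {
  std::minstd_rand rng(seed);
  std::normal_distribution<float> dist(mean, stddev);
  rng.discard(i + offset);  // jump to this element's slot in the stream
  return dist(rng);
}
```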
b/paddle/fluid/operators/interpolate_v2_op.cc index 12733a0d9f1689a020f77d23cc31b0d19b412746..1f7dde9b931dafa4b8e0bee211e64461b1c21dc5 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -67,7 +67,7 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { scale_tensor[0], 1, platform::errors::InvalidArgument( "Scale's shape must be 1, but got shape = %d .", scale_tensor[0])); - // out_w = -1; + out_w = -1; } else { auto scale = ctx->Attrs().Get>("scale"); if (scale.size() > 0) { @@ -159,8 +159,8 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { platform::errors::InvalidArgument( "Scale's shape must be 2 or 1, but got shape = %d .", scale_tensor[0])); - // out_h = -1; - // out_w = -1; + out_h = -1; + out_w = -1; } else { auto scale = ctx->Attrs().Get>("scale"); if (scale.size() > 0) { @@ -264,9 +264,9 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { platform::errors::InvalidArgument( "Scale's shape must be 3 or 1, but got shape = %d .", scale_tensor[0])); - // out_d = -1; - // out_h = -1; - // out_w = -1; + out_d = -1; + out_h = -1; + out_w = -1; } else { auto scale = ctx->Attrs().Get>("scale"); if (scale.size() > 0) { @@ -633,6 +633,9 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(InterpolateV2GradNoNeedBufferVarsInferer, } // namespace operators } // namespace paddle +// interp_v2 support scale_factor whose input type is list, this operation is +// not +// compatible with interp_op, so a new one is added in paddle2.0 namespace ops = paddle::operators; REGISTER_OPERATOR(bilinear_interp_v2, ops::InterpolateV2Op, ops::InterpolateV2OpMaker, diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 6cb8104638dea458743374014e7bef35df2dbfcc..816539c3b5fdb805d16fb8224b7c960f797613cb 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -836,12 +836,12 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + float scale_w = -1; if (list_new_shape_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_shape_tensor); out_w = new_size[0]; } else { - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -887,8 +887,11 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, float ratio_w = 0.f; if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? 
static_cast(in_w - 1.0) / (out_w - 1.0) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_cw = c * in_w; @@ -924,14 +927,14 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + float scale_w = -1; + float scale_h = -1; if (list_new_shape_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_shape_tensor); out_h = new_size[0]; out_w = new_size[1]; } else { - float scale_h = -1; - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -993,12 +996,18 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_hw = in_h * in_w; @@ -1048,6 +1057,9 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + float scale_w = -1; + float scale_d = -1; + float scale_h = -1; if (list_new_shape_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_shape_tensor); @@ -1055,9 +1067,6 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, out_h = new_size[1]; out_w = new_size[2]; } else { - float scale_d = -1; - float scale_h = -1; - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -1129,16 +1138,25 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(in_d) / out_d; + : static_cast(new_scale_d); } if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_dhw = in_d * in_h * in_w; @@ -1230,8 +1248,11 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, float ratio_w = 0.f; if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_cw = c * in_w; int out_cw = c * out_w; @@ -1333,12 +1354,18 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_hw = in_h * in_w; @@ -1464,16 +1491,25 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(in_d) / out_d; + : static_cast(new_scale_d); } if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_dhw = in_d * in_h * in_w; diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h index 111766934b8300c0a7b46ae9a065b8c42460e577..4e4fd9ff63ba47b41363a81d6cc527486671d695 100644 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -783,12 +783,13 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + float scale_w = -1.; if (list_new_size_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_size_tensor); out_w = new_size[0]; } else { - float scale_w = -1; + // float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -833,8 +834,11 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, float ratio_w = 0.f; if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("linear" == interp_method) { LinearInterpolation(input, output, ratio_w, in_w, n, c, out_w, @@ -856,6 +860,8 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); + float scale_h = -1; + float scale_w = -1; auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { @@ -864,8 +870,6 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, out_h = new_size[0]; out_w = new_size[1]; } else { - float scale_h = -1; - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -925,12 +929,18 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("bilinear" == interp_method) { @@ -962,6 +972,10 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { // have size tensor @@ -970,9 +984,6 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, out_h = new_size[1]; out_w = new_size[2]; } else { - float scale_d = -1; - float scale_h = -1; - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -1043,16 +1054,25 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(in_d) / out_d; + : static_cast(new_scale_d); } if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("trilinear" == interp_method) { @@ -1127,8 +1147,11 @@ static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, float ratio_w = 0.f; if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("linear" == interp_method) { LinearInterpolationGrad(output_grad, input_grad, ratio_w, in_w, n, c, @@ -1216,12 +1239,18 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("bilinear" == interp_method) { @@ -1327,16 +1356,25 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(in_d) / out_d; + : static_cast(new_scale_d); } if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("trilinear" == interp_method) { diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu index 8aca892a81d41b1e0a9f7f9c14169c2817ae9452..793253b6b8894de8d89b301921383ebfd53d66fc 100644 --- a/paddle/fluid/operators/linspace_op.cu +++ b/paddle/fluid/operators/linspace_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/linspace_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -19,6 +20,8 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template __global__ void LinspaceKernel(T start, double step, int64_t size, T* out) { CUDA_KERNEL_LOOP(index, size) { @@ -35,15 +38,27 @@ template class CUDALinspaceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* start_t = context.Input("Start"); - auto* stop_t = context.Input("Stop"); + auto* pre_start = context.Input("Start"); + auto* pre_stop = context.Input("Stop"); auto* num_t = context.Input("Num"); auto* out = context.Output("Out"); + auto dtype = static_cast( + context.Attr("dtype")); + + Tensor start_t; + Tensor stop_t; + auto start_dtype = + framework::OpKernelType(pre_start->type(), context.GetPlace()); + auto stop_dtype = + framework::OpKernelType(pre_stop->type(), context.GetPlace()); + auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); + framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); + framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); framework::Tensor n; - framework::TensorCopy(*start_t, platform::CPUPlace(), &n); + framework::TensorCopy(start_t, platform::CPUPlace(), &n); T start = n.data()[0]; - framework::TensorCopy(*stop_t, platform::CPUPlace(), &n); + framework::TensorCopy(stop_t, platform::CPUPlace(), &n); T stop = n.data()[0]; framework::TensorCopy(*num_t, platform::CPUPlace(), &n); int32_t num = n.data()[0]; diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h index 9fb4960375ed7be60598d558c65310bd4a4b84bc..898f611f864dc8bfac2ba7e41b91f5f5bbe524ab 100644 --- a/paddle/fluid/operators/linspace_op.h +++ b/paddle/fluid/operators/linspace_op.h @@ -14,20 +14,38 @@ limitations under the License. 
*/ #pragma once #include +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class CPULinspaceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - T start = context.Input("Start")->data()[0]; - T stop = context.Input("Stop")->data()[0]; + auto* pre_start = context.Input("Start"); + auto* pre_stop = context.Input("Stop"); int32_t num = context.Input("Num")->data()[0]; auto* out = context.Output("Out"); + auto dtype = static_cast( + context.Attr("dtype")); + + Tensor start_t; + Tensor stop_t; + auto start_dtype = + framework::OpKernelType(pre_start->type(), context.GetPlace()); + auto stop_dtype = + framework::OpKernelType(pre_stop->type(), context.GetPlace()); + auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); + framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); + framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); + + T start = start_t.data()[0]; + T stop = stop_t.data()[0]; PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0."); out->Resize(framework::make_ddim({num})); diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h index 3a5eddcbf4af699a89ae1a21571337155699a1f3..18d9a6310dd6c09905ca7fa84d98f391a84dfa2d 100644 --- a/paddle/fluid/operators/math/concat_and_split.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -65,13 +65,14 @@ class SplitFunctor { } // namespace operators } // namespace paddle -#define FOR_ALL_TYPES(macro) \ - macro(int); \ - macro(float); \ - macro(double); \ - macro(bool); \ - macro(int64_t); \ - macro(int16_t); \ - macro(uint8_t); \ - macro(int8_t); \ - macro(::paddle::platform::float16) +#define FOR_ALL_TYPES(macro) \ + macro(int); \ + macro(float); \ + macro(double); \ + macro(bool); \ + macro(int64_t); \ + macro(int16_t); \ + macro(uint8_t); \ + macro(int8_t); \ + macro(::paddle::platform::float16); \ + macro(::paddle::platform::bfloat16) diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 6748d0ab43f70f997b3008f34f4be743b81e8946..824e66b1eb4ae05cc74dc1cd8c21f16f286592e6 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -34,6 +34,7 @@ namespace math { using float16 = paddle::platform::float16; template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -41,16 +42,18 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ template struct Transpose; DEFINE_CPU_TRANS(1); diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc 
b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index aba32c6ec92854f3ea9248d92f7e435551ae83b0..19ee8764e27b235a2fa8e0720c11bce601b030db 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -1055,7 +1055,11 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
       astream.wait();

       filter_grad->set_layout(DataLayout::kMKLDNN);
-      filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p));
+      // In oneDNN, convolution groups are treated as a separate dimension,
+      // which is not the case in PaddlePaddle.
+      auto filter_fmt = GetMKLDNNFormat(*diff_weights_memory_p);
+      filter_grad->set_format(platform::MKLDNNFormatForSize(
+          g > 1 ? weights_tz.size() - 1 : weights_tz.size(), filter_fmt));
     }
     if (input_grad) {
       auto weights_memory_p = handler.AcquireWeightsMemoryFromDataPrimitive(
diff --git a/paddle/fluid/operators/randint_op.cu b/paddle/fluid/operators/randint_op.cu
index a07a92621e6b3726be518df6abcec58257a91489..40e390b0b87246bbaa8474262df8ba5576297385 100644
--- a/paddle/fluid/operators/randint_op.cu
+++ b/paddle/fluid/operators/randint_op.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include
 #include
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/uniform_random_op.h"
@@ -49,15 +50,23 @@ class GPURandintKernel : public framework::OpKernel<T> {
     int64_t size = out->numel();
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      std::random_device rd;
-      seed = rd();
-    }
-    engine.seed(seed);
+
     std::uniform_int_distribution<> dist(context.Attr<int>("low"),
                                          context.Attr<int>("high") - 1);
-    for (int64_t i = 0; i < size; ++i) data[i] = dist(engine);
+    auto engine = framework::GetCPURandomEngine(seed);
+
+    for (int64_t i = 0; i < size; ++i) {
+      data[i] = dist(*engine);
+    }

     if (platform::is_gpu_place(context.GetPlace())) {
       // Copy tensor to out
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cu b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu
index c25e5d01b2758a96192d6fbf8f4e881770cbbbf0..c9ad1075c0c3c1c6f405144dbfde2e81b85124aa 100644
--- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cu
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"

 REGISTER_OP_CUDA_KERNEL(logsumexp,
@@ -20,8 +19,3 @@ REGISTER_OP_CUDA_KERNEL(logsumexp,
                                           float, ops::LogsumexpFunctor>,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           double, ops::LogsumexpFunctor>);
-REGISTER_OP_CUDA_KERNEL(
-    logsumexp_grad,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float,
-                          ops::LogsumexpGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::LogsumexpGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu b/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d6ad4863092a50233b806c944db0b8c161ed9dd0
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu
@@ -0,0 +1,22 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// .part used to speed up nvcc compile +#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" + +REGISTER_OP_CUDA_KERNEL( + logsumexp_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index b45fa7c791ff22be422ce12a8348a071c60ddd0f..70733d643673ad8acde9a45f273a52a9723fb0d3 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -53,7 +53,7 @@ REGISTER_OPERATOR( size, ops::SizeOp, ops::SizeOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(size, ops::SizeKernel, ops::SizeKernel, +REGISTER_OP_CPU_KERNEL(size, ops::SizeKernel, ops::SizeKernel, ops::SizeKernel, ops::SizeKernel, ops::SizeKernel, ops::SizeKernel); diff --git a/paddle/fluid/operators/size_op.cu b/paddle/fluid/operators/size_op.cu index 3ea3032693236d5618ff6f0c858cbd85e34633ab..de56ecd95270577689f699462b9273b43f34595e 100644 --- a/paddle/fluid/operators/size_op.cu +++ b/paddle/fluid/operators/size_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ REGISTER_OP_CUDA_KERNEL( size, paddle::operators::SizeKernel, - paddle::operators::SizeKernel, + paddle::operators::SizeKernel, paddle::operators::SizeKernel, paddle::operators::SizeKernel, paddle::operators::SizeKernel, paddle::operators::SizeKernel); diff --git a/paddle/fluid/operators/size_op.h b/paddle/fluid/operators/size_op.h index fb44070897156ef88062231322e28a2db1f244a7..e8c53d6e683305bfc1ff7c052a2dc54ecf465936 100644 --- a/paddle/fluid/operators/size_op.h +++ b/paddle/fluid/operators/size_op.h @@ -26,8 +26,18 @@ class SizeKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in_t = ctx.Input("Input"); auto* out_t = ctx.Output("Out"); - auto out_data = out_t->mutable_data(platform::CPUPlace()); - out_data[0] = in_t->numel(); + auto place = ctx.GetPlace(); + auto out_data = out_t->mutable_data(place); + auto cpu_place = platform::CPUPlace(); + if (place == cpu_place) { + out_data[0] = in_t->numel(); + } else { + Tensor cpu_tensor; + auto cpu_data = + cpu_tensor.mutable_data(out_t->dims(), cpu_place); + cpu_data[0] = in_t->numel(); + TensorCopy(cpu_tensor, place, out_t); + } } }; } // namespace operators diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu index 5a3510babe4d57b9e80f0e7898df98033834ca15..a838c30771a5c1229061a58b12c6777a3d24c6f3 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cu +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu @@ -15,6 +15,7 @@ limitations under the License. 
*/
#include <limits>
#include <thrust/random.h>
#include <thrust/transform.h>
+#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
@@ -46,6 +47,37 @@ struct TruncatedNormal {
   }
 };

+template <typename T>
+struct TruncatedNormalOffset {
+  T mean, std;
+  T a_normal_cdf;
+  T b_normal_cdf;
+  unsigned int seed;
+  T numeric_min;
+  int offset_;
+
+  __host__ __device__ TruncatedNormalOffset(T mean, T std, T numeric_min,
+                                            int seed, int offset)
+      : mean(mean),
+        std(std),
+        seed(seed),
+        numeric_min(numeric_min),
+        offset_(offset) {
+    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
+    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
+  }
+
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed);
+    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
+    rng.discard(n + offset_);
+    T value = dist(rng);
+    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
+    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
+  }
+};
+
 template <typename T>
 class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
  public:
@@ -54,14 +86,31 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
     T* data = tensor->mutable_data<T>(context.GetPlace());
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    bool seed_flag = false;
     if (seed == 0) {
       std::random_device rd;
       seed = rd();
+      seed_flag = true;
     }
     T mean = static_cast<T>(context.Attr<float>("mean"));
     T std = static_cast<T>(context.Attr<float>("std"));
     thrust::counting_iterator<unsigned int> index_sequence_begin(0);
     int64_t size = tensor->numel();
+
+    int device_id =
+        BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId();
+    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
+
+    if (gen_cuda->GetIsInitPy() && seed_flag) {
+      auto seed_offset = gen_cuda->IncrementOffset(1);
+      int gen_offset = size * seed_offset.second;
+      thrust::transform(
+          index_sequence_begin, index_sequence_begin + size,
+          thrust::device_ptr<T>(data),
+          TruncatedNormalOffset<T>(mean, std, std::numeric_limits<T>::min(),
+                                   seed_offset.first, gen_offset));
+      return;
+    }
+
     thrust::transform(
         index_sequence_begin, index_sequence_begin + size,
         thrust::device_ptr<T>(data),
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index 4df1e0ffeb97564803f452114d52ab03d0464f8a..6237137cccbc6840b345c9e26dda1ccdc8df43b0 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -51,6 +51,39 @@ struct UniformGenerator {
   }
 };

+template <typename T>
+struct UniformGeneratorOffset {
+  T min_, max_;
+  unsigned int seed_;
+  T diag_val_;
+  unsigned int diag_num_;
+  unsigned int diag_step_;
+  int offset_;
+  __host__ __device__ UniformGeneratorOffset(T min, T max, int seed,
+                                             int diag_num, int diag_step,
+                                             T diag_val, int offset)
+      : min_(min),
+        max_(max),
+        seed_(seed),
+        diag_num_(diag_num),
+        diag_step_(diag_step),
+        diag_val_(diag_val),
+        offset_(offset) {}
+
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed_);
+    thrust::uniform_real_distribution<T> dist(min_, max_);
+    rng.discard(n + offset_);
+    T out = dist(rng);
+    unsigned int remainder = n % (diag_step_ + 1);
+    if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) {
+      out = diag_val_;
+    }
+    return out;
+  }
+};
+
 // It seems that Eigen::Tensor::random in GPU will SEGFAULT.
 // Use std::random and thrust::random (Thrust is a standard library shipped
 // with CUDA) to implement uniform random.
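The `*Offset` functors introduced in this patch all follow the same counter-based pattern: element `n` seeds a fresh `minstd_rand` and then discards `n + offset` states, where the offset is advanced by `size * seed_offset.second` on every launch via `Generator::IncrementOffset`. A minimal self-contained sketch of why that stays reproducible while never reusing engine states across launches; `uniform_at` and its parameters are illustrative only and not part of the patch:

    #include <cstdint>
    #include <thrust/random.h>

    // Value that element n of launch number `launch` would produce. Each
    // launch advances the starting offset by `size`, so two launches never
    // consume the same engine states, yet any single element can be
    // recomputed on the host from (seed, size, launch, n) alone.
    template <typename T>
    T uniform_at(unsigned int seed, int64_t size, int launch, unsigned int n,
                 T lo, T hi) {
      thrust::minstd_rand rng;
      rng.seed(seed);
      thrust::uniform_real_distribution<T> dist(lo, hi);
      rng.discard(n + static_cast<uint64_t>(size) * launch);  // skip ahead
      return dist(rng);
    }

With this scheme, the unseeded (`seed == 0`) path of the kernels above becomes deterministic once the Python-side generator is initialized, because the global generator, rather than `std::random_device`, supplies both the seed and the per-launch offset.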
@@ -89,10 +122,11 @@ class GPUUniformRandomKernel : public framework::OpKernel { } T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); - + bool seed_flag = false; if (seed == 0) { std::random_device rd; seed = rd(); + seed_flag = true; } T min = static_cast(context.Attr("min")); @@ -104,10 +138,23 @@ class GPUUniformRandomKernel : public framework::OpKernel { T diag_val = static_cast(context.Attr("diag_val")); thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + if (gen_cuda->GetIsInitPy() && seed_flag) { + auto seed_offset = gen_cuda->IncrementOffset(1); + int gen_offset = size * seed_offset.second; + thrust::transform( + index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + UniformGeneratorOffset(min, max, seed_offset.first, diag_num, + diag_step, diag_val, gen_offset)); + } else { + thrust::transform( + index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); + } } }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 652b4dd47daa8aecdcae43e8c910d7dd61bbb64d..ef827fd74903afd007c864307e942749e3eb0bd1 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -136,6 +136,8 @@ cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) +cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) + nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h new file mode 100644 index 0000000000000000000000000000000000000000..742329abb2dae20437120c0d4ba5975d41b0a7c9 --- /dev/null +++ b/paddle/fluid/platform/bfloat16.h @@ -0,0 +1,439 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
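+
+// bfloat16 is the upper half of an IEEE-754 binary32 value: 1 sign bit,
+// 8 exponent bits and 7 explicit mantissa bits, so it keeps float's dynamic
+// range at reduced precision. The conversions below copy the two high-order
+// bytes of the float representation (this assumes a little-endian host,
+// which holds for the supported targets), i.e. float -> bfloat16 truncates
+// the mantissa rather than rounding to nearest-even.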
+ +#pragma once + +#include +#include +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#include +#include "paddle/fluid/platform/hostdevice.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace platform { + +struct PADDLE_ALIGN(2) bfloat16 { + public: + uint16_t x; + + bfloat16() = default; + bfloat16(const bfloat16& o) = default; + bfloat16& operator=(const bfloat16& o) = default; + bfloat16(bfloat16&& o) = default; + bfloat16& operator=(bfloat16&& o) = default; + ~bfloat16() = default; + + HOSTDEVICE inline explicit bfloat16(float val) { + std::memcpy(&x, reinterpret_cast(&val) + 2, 2); + } + + template + HOSTDEVICE inline explicit bfloat16(const T& val) + : x(bfloat16(static_cast(val)).x) {} + + HOSTDEVICE inline bfloat16& operator=(bool b) { + x = b ? 0x3f80 : 0; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int8_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint8_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int16_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint16_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int32_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint32_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int64_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint64_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(float val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(double val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline explicit operator float() const { + float val = 0.f; + uint16_t temp = x; + memcpy(reinterpret_cast(&val) + 2, reinterpret_cast(&temp), + 2); + return val; + } + + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(static_cast(*this)); + } +}; + +HOSTDEVICE inline bfloat16 operator+(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) + static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator-(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) - static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator*(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) * static_cast(b)); +} + +HOSTDEVICE inline bfloat16 
operator/(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) / static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator-(const bfloat16& a) { + bfloat16 res; + res.x = a.x ^ 0x8000; + return res; +} + +HOSTDEVICE inline bfloat16& operator+=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) + static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator-=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) - static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator*=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) * static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator/=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) / static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) { + bfloat16 res; + res.x = a; + return res; +} + +HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) { + return static_cast(a) == static_cast(b); +} + +HOSTDEVICE inline bool operator!=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) != static_cast(b); +} + +HOSTDEVICE inline bool operator<(const bfloat16& a, const bfloat16& b) { + return static_cast(a) < static_cast(b); +} + +HOSTDEVICE inline bool operator<=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) <= static_cast(b); +} + +HOSTDEVICE inline bool operator>(const bfloat16& a, const bfloat16& b) { + return static_cast(a) > static_cast(b); +} + +HOSTDEVICE inline bool operator>=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) >= static_cast(b); +} + +HOSTDEVICE inline bool(isnan)(const bfloat16& a) { + return (a.x & 0x7FFF) > 0x7F80; +} + +HOSTDEVICE inline bool(isinf)(const bfloat16& a) { + return (a.x & 0x7F80) == 0x7F80; +} + +HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { + return !((isnan)(a)) && !((isinf)(a)); +} + +inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) { + os << a.x; + return os; +} + +} // namespace platform +} // namespace paddle + +namespace std { + +template <> +struct is_pod { + static const bool value = + is_trivial::value && + is_standard_layout::value; +}; + +template <> +struct is_floating_point + : std::integral_constant< + bool, std::is_same::type>::value> {}; +template <> +struct is_signed { + static const bool value = true; +}; + +template <> +struct is_unsigned { + static const bool value = false; +}; + +inline bool isnan(const paddle::platform::bfloat16& a) { + return paddle::platform::isnan(a); +} + +inline bool isinf(const paddle::platform::bfloat16& a) { + return paddle::platform::isinf(a); +} + +template <> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 8; + static const int digits10 = 2; + static const int max_digits10 = 9; + static const int radix = 2; + static const int min_exponent = -125; + static const int min_exponent10 = -37; + 
static const int max_exponent = 128; + static const int max_exponent10 = 38; + static const bool traps = true; + static const bool tinyness_before = false; + + static paddle::platform::bfloat16(min)() { + return paddle::platform::raw_uint16_to_bfloat16(0x007f); + } + static paddle::platform::bfloat16 lowest() { + return paddle::platform::raw_uint16_to_bfloat16(0xff7f); + } + static paddle::platform::bfloat16(max)() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f7f); + } + static paddle::platform::bfloat16 epsilon() { + return paddle::platform::raw_uint16_to_bfloat16(0x3400); + } + static paddle::platform::bfloat16 round_error() { + return paddle::platform::bfloat16(0.5); + } + static paddle::platform::bfloat16 infinity() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f80); + } + static paddle::platform::bfloat16 quiet_NaN() { + return paddle::platform::raw_uint16_to_bfloat16(0xffc1); + } + static paddle::platform::bfloat16 signaling_NaN() { + return paddle::platform::raw_uint16_to_bfloat16(0xff81); + } + static paddle::platform::bfloat16 denorm_min() { + return paddle::platform::raw_uint16_to_bfloat16(0x0001); + } +}; + +} // namespace std + +namespace Eigen { + +using bfloat16 = paddle::platform::bfloat16; + +template <> +struct NumTraits : GenericNumTraits { + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + HOSTDEVICE static inline bfloat16 epsilon() { + return paddle::platform::raw_uint16_to_bfloat16(0x3400); + } + HOSTDEVICE static inline bfloat16 dummy_precision() { + return bfloat16(1e-5f); + } + HOSTDEVICE static inline bfloat16 highest() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f7f); + } + HOSTDEVICE static inline bfloat16 lowest() { + return paddle::platform::raw_uint16_to_bfloat16(0xff7f); + } + HOSTDEVICE static inline bfloat16 infinity() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f80); + } + HOSTDEVICE static inline bfloat16 quiet_NaN() { + return paddle::platform::raw_uint16_to_bfloat16(0xffc1); + } +}; +namespace numext { + +template <> +HOSTDEVICE inline bool(isnan)(const bfloat16& a) { + return (paddle::platform::isnan)(a); +} + +template <> +HOSTDEVICE inline bool(isinf)(const bfloat16& a) { + return (paddle::platform::isinf)(a); +} + +template <> +HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { + return (paddle::platform::isfinite)(a); +} + +template <> +HOSTDEVICE inline bfloat16 exp(const bfloat16& a) { + return bfloat16(::expf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 erf(const bfloat16& a) { + return bfloat16(::erff(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 log(const bfloat16& a) { + return bfloat16(::logf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 tanh(const bfloat16& a) { + return bfloat16(::tanhf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 sqrt(const bfloat16& a) { + return bfloat16(::sqrtf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 ceil(const bfloat16& a) { + return bfloat16(::ceilf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 floor(const bfloat16& a) { + return bfloat16(::floorf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 round(const bfloat16& a) { + return bfloat16(::roundf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 pow(const bfloat16& a, const bfloat16& b) { + return bfloat16(::powf(static_cast(a), static_cast(b))); +} + +template <> +HOSTDEVICE inline bfloat16 abs(const bfloat16& a) { + 
return bfloat16(::fabs(static_cast(a))); +} + +} // namespace numext +} // namespace Eigen diff --git a/paddle/fluid/platform/bfloat16_test.cc b/paddle/fluid/platform/bfloat16_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..bdb508ee33630004daae132fcdcf71146a50e640 --- /dev/null +++ b/paddle/fluid/platform/bfloat16_test.cc @@ -0,0 +1,162 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/bfloat16.h" + +#include + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" + +namespace paddle { +namespace platform { + +using bfloat16 = paddle::platform::bfloat16; + +TEST(bfloat16, conversion_cpu) { + // Conversion from float + EXPECT_EQ(bfloat16(1.0f).x, 0x3f80); + EXPECT_EQ(bfloat16(0.5f).x, 0x3f00); + EXPECT_EQ(bfloat16(0.33333f).x, 0x3eaa); + EXPECT_EQ(bfloat16(0.0f).x, 0x0000); + EXPECT_EQ(bfloat16(-0.0f).x, 0x8000); + EXPECT_EQ(bfloat16(65504.0f).x, 0x477f); + EXPECT_EQ(bfloat16(65536.0f).x, 0x4780); + + // Conversion from double + EXPECT_EQ(bfloat16(1.0).x, 0x3f80); + EXPECT_EQ(bfloat16(0.5).x, 0x3f00); + EXPECT_EQ(bfloat16(0.33333).x, 0x3eaa); + EXPECT_EQ(bfloat16(0.0).x, 0x0000); + EXPECT_EQ(bfloat16(-0.0).x, 0x8000); + EXPECT_EQ(bfloat16(65504.0).x, 0x477f); + EXPECT_EQ(bfloat16(65536.0).x, 0x4780); + + // Conversion from int + EXPECT_EQ(bfloat16(-1).x, 0xbf80); + EXPECT_EQ(bfloat16(0).x, 0x0000); + EXPECT_EQ(bfloat16(1).x, 0x3f80); + EXPECT_EQ(bfloat16(2).x, 0x4000); + EXPECT_EQ(bfloat16(3).x, 0x4040); + + // Conversion from bool + EXPECT_EQ(bfloat16(true).x, 0x3f80); + EXPECT_EQ(bfloat16(false).x, 0x0000); + + // Assignment operator + bfloat16 v_assign; + v_assign = bfloat16(0.f); + EXPECT_EQ(v_assign.x, 0x0000); + v_assign = 0.5f; + EXPECT_EQ(v_assign.x, 0x3f00); + v_assign = 0.33333; + EXPECT_EQ(v_assign.x, 0x3eaa); + v_assign = -1; + EXPECT_EQ(v_assign.x, 0xbf80); + + // Conversion operator + EXPECT_EQ(static_cast(bfloat16(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(bfloat16(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(bfloat16(-1)), -1); + EXPECT_EQ(static_cast(bfloat16(true)), true); +} + +TEST(bfloat16, arithmetic_cpu) { + EXPECT_NEAR(static_cast(bfloat16(1) + bfloat16(1)), 2, 0.001); + EXPECT_EQ(static_cast(bfloat16(5) + bfloat16(-5)), 0); + EXPECT_NEAR(static_cast(bfloat16(0.33333f) + bfloat16(0.66667f)), 1.0f, + 0.01); + EXPECT_EQ(static_cast(bfloat16(3) - bfloat16(5)), -2); + EXPECT_NEAR(static_cast(bfloat16(0.66667f) - bfloat16(0.33333f)), + 0.33334f, 0.01); + EXPECT_NEAR(static_cast(bfloat16(3.3f) * bfloat16(2.0f)), 6.6f, 0.01); + EXPECT_NEAR(static_cast(bfloat16(-2.1f) * bfloat16(-3.0f)), 6.3f, 0.1); + EXPECT_NEAR(static_cast(bfloat16(2.0f) / bfloat16(3.0f)), 0.66667f, + 0.01); + EXPECT_EQ(static_cast(bfloat16(1.0f) / bfloat16(2.0f)), 0.5f); + EXPECT_EQ(static_cast(-bfloat16(512.0f)), 
-512.0f); + EXPECT_EQ(static_cast(-bfloat16(-512.0f)), 512.0f); +} + +TEST(bfloat16, comparison_cpu) { + EXPECT_TRUE(bfloat16(1.0f) == bfloat16(1.0f)); + EXPECT_FALSE(bfloat16(-1.0f) == bfloat16(-0.5f)); + EXPECT_TRUE(bfloat16(1.0f) != bfloat16(0.5f)); + EXPECT_FALSE(bfloat16(-1.0f) != bfloat16(-1.0f)); + EXPECT_TRUE(bfloat16(1.0f) < bfloat16(2.0f)); + EXPECT_FALSE(bfloat16(-1.0f) < bfloat16(-1.0f)); + EXPECT_TRUE(bfloat16(1.0f) <= bfloat16(1.0f)); + EXPECT_TRUE(bfloat16(2.0f) > bfloat16(1.0f)); + EXPECT_FALSE(bfloat16(-2.0f) > bfloat16(-2.0f)); + EXPECT_TRUE(bfloat16(2.0f) >= bfloat16(2.0f)); +} + +TEST(bfloat16, lod_tensor_cpu) { + framework::LoDTensor lod_tensor; + + std::vector input_data = {bfloat16(1.0f), bfloat16(0.5f), + bfloat16(0.33333f), bfloat16(0.0f)}; + EXPECT_EQ(input_data[0].x, 0x3f80); + EXPECT_EQ(input_data[1].x, 0x3f00); + EXPECT_EQ(input_data[2].x, 0x3eaa); + EXPECT_EQ(input_data[3].x, 0x0000); + + lod_tensor.Resize({4, 1}); + lod_tensor.set_lod(framework::LoD({{0, 2, 4}})); + bfloat16* data_ptr = lod_tensor.mutable_data(CPUPlace()); + + EXPECT_NE(data_ptr, nullptr); + EXPECT_EQ(input_data.size(), static_cast(lod_tensor.numel())); + for (size_t i = 0; i < input_data.size(); ++i) { + data_ptr[i] = input_data[i]; + EXPECT_EQ(data_ptr[i].x, input_data[i].x); + } +} + +TEST(bfloat16, floating) { + // compile time assert. + PADDLE_ENFORCE_EQ( + std::is_floating_point::value, true, + platform::errors::Fatal("std::is_floating_point with bfloat16 data type " + "should be equal to true but it is not")); +} + +TEST(bfloat16, print) { + bfloat16 a = bfloat16(1.0f); + std::cout << a << std::endl; +} + +// CPU test +TEST(bfloat16, isinf) { + bfloat16 a; + a.x = 0x7f80; + bfloat16 b = bfloat16(INFINITY); + bfloat16 c = static_cast(INFINITY); + EXPECT_EQ(std::isinf(a), true); + EXPECT_EQ(std::isinf(b), true); + EXPECT_EQ(std::isinf(c), true); +} + +TEST(bfloat16, isnan) { + bfloat16 a; + a.x = 0x7fff; + bfloat16 b = bfloat16(NAN); + bfloat16 c = static_cast(NAN); + EXPECT_EQ(std::isnan(a), true); + EXPECT_EQ(std::isnan(b), true); + EXPECT_EQ(std::isnan(c), true); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index efb57e12fdbe650e74101355da73be929f072be7..bbe847e7190d6f9812dcc814d4b4fe74a0cc7ef6 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -273,11 +273,116 @@ class ScopedTensorDescriptor { groups); } + inline cudnnTensorDescriptor_t descriptor(const cudnnDataType_t cudnn_type, + const std::vector& dim, + const std::vector& stride) { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + desc_, cudnn_type, dim.size(), dim.data(), stride.data())); + return desc_; + } + + template + inline cudnnTensorDescriptor_t descriptor(const std::vector& dim, + const std::vector& stride) { + return descriptor(CudnnDataType::type, dim, stride); + } + private: cudnnTensorDescriptor_t desc_; DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); }; +class ScopedRNNTensorDescriptor { + public: + ScopedRNNTensorDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateRNNDataDescriptor(&desc_)); + } + + ~ScopedRNNTensorDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyRNNDataDescriptor(desc_)); + } + + inline cudnnRNNDataDescriptor_t descriptor( + const cudnnDataType_t cudnn_type, int max_seq_length, int batch_size, + int input_size, bool time_major, const std::vector& seq_length) { + static float padding_fill = 
0.0f; + cudnnRNNDataLayout_t layout; + + if (time_major) { + layout = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED; + } else { + layout = CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED; + } + + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetRNNDataDescriptor( + desc_, cudnn_type, layout, max_seq_length, batch_size, input_size, + seq_length.data(), static_cast(&padding_fill))); + + return desc_; + } + + template + inline cudnnRNNDataDescriptor_t descriptor( + int max_length, int batch_size, int input_size, bool time_major, + const std::vector& seq_length) { + return descriptor(CudnnDataType::type, max_length, batch_size, + input_size, time_major, seq_length); + } + + private: + cudnnRNNDataDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedRNNTensorDescriptor); +}; + +class ScopedDropoutDescriptor { + public: + ScopedDropoutDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateDropoutDescriptor(&desc_)); + } + ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyDropoutDescriptor(desc_)); + } + + inline cudnnDropoutDescriptor_t descriptor(const cudnnHandle_t& handle, + const platform::Place& place, + bool initialized, + float dropout_prob_, + framework::Tensor* dropout_state_, + int seed, size_t state_size) { + auto* dropout_state_data = dropout_state_->data(); + if (!initialized) { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetDropoutDescriptor( + desc_, handle, dropout_prob_, dropout_state_data, state_size, seed)); + } else { + auto dropout_state_dims = dropout_state_->dims(); + state_size = dropout_state_dims[0]; + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnRestoreDropoutDescriptor( + desc_, handle, dropout_prob_, dropout_state_data, state_size, 0)); + } + return desc_; + } + + private: + cudnnDropoutDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedDropoutDescriptor); +}; + +class ScopedRNNDescriptor { + public: + ScopedRNNDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateRNNDescriptor(&desc_)); + } + ~ScopedRNNDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyRNNDescriptor(desc_)); + } + + inline cudnnRNNDescriptor_t descriptor() { return desc_; } + + private: + cudnnRNNDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedRNNDescriptor); +}; + class ScopedFilterDescriptor { public: ScopedFilterDescriptor() { @@ -319,6 +424,167 @@ class ScopedFilterDescriptor { DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); }; +class ScopedRNNBase { + public: + ScopedRNNBase(int seq_length, int batch_size, int input_size, int hidden_size, + int num_layers, float dropout_prob, int seed, int weight_numel, + bool initialized, bool is_bidirec) + : seq_length_(seq_length), + batch_size_(batch_size), + input_size_(input_size), + hidden_size_(hidden_size), + num_layers_(num_layers), + dropout_prob_(dropout_prob), + seed_(seed), + weight_numel_(weight_numel), + initialized_(initialized), + is_bidirec_(is_bidirec) {} + + template + void Create(const cudnnHandle_t& handle, const platform::Place& place, + std::vector sequence_length, size_t* workspace_size, + size_t* reserve_size, framework::Tensor* dropout_state) { + int numDirections = is_bidirec_ ? 
2 : 1; + cudnnDataType_t cudnn_type = platform::CudnnDataType::type; + + // ------------------- cudnn x, y descriptors --------------------- + std::vector dims_x = {batch_size_, input_size_, 1}; + std::vector strides_x = {input_size_, 1, 1}; + + std::vector dims_y = {batch_size_, hidden_size_ * numDirections, 1}; + std::vector strides_y = {hidden_size_ * numDirections, 1, 1}; + + for (int i = 0; i < seq_length_; ++i) { + x_desc_.emplace_back(x_d.descriptor(dims_x, strides_x)); + y_desc_.emplace_back(y_d.descriptor(dims_y, strides_y)); + } + + if (!sequence_length.empty()) { + x_seq_desc_ = x_seq_d.descriptor(seq_length_, batch_size_, input_size_, + true, sequence_length); + y_seq_desc_ = y_seq_d.descriptor(seq_length_, batch_size_, + hidden_size_ * numDirections, true, + sequence_length); + } + + // ------------------- cudnn hx, hy, cx, cy descriptors---------- + std::vector dims_hx = {num_layers_ * numDirections, batch_size_, + hidden_size_}; + std::vector strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1}; + + hx_desc_ = hx_d.descriptor(dims_hx, strides_hx); + cx_desc_ = cx_d.descriptor(dims_hx, strides_hx); + hy_desc_ = hy_d.descriptor(dims_hx, strides_hx); + cy_desc_ = cy_d.descriptor(dims_hx, strides_hx); + + // ------------------- cudnn dropout descriptors --------------------- + size_t state_size; + if (!initialized_) { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnDropoutGetStatesSize(handle, &state_size)); + dropout_state->mutable_data({static_cast(state_size)}, + place); + } + dropout_desc_ = + dropout_d.descriptor(handle, place, initialized_, dropout_prob_, + dropout_state, seed_, state_size); + + // ------------------- cudnn rnn descriptors --------------------- + rnn_desc_ = rnn_d.descriptor(); + +#if CUDNN_VERSION >= 6000 + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, cudnn_type)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, + is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + cudnn_type)); +#endif + if (!sequence_length.empty()) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( + rnn_desc_, CUDNN_RNN_PADDED_IO_ENABLED)); + } + // ------------------- cudnn weights_size --------------------- + size_t weights_size_; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_, x_desc_[0], &weights_size_, cudnn_type)); + + PADDLE_ENFORCE_EQ( + weights_size_, sizeof(T) * weight_numel_, + platform::errors::InvalidArgument( + "The cudnn lstm and setting weight size should be same.")); + + // ------------------- cudnn weight descriptors --------------------- + platform::DataLayout layout = platform::DataLayout::kNCHW; + int dim_tmp = weights_size_ / sizeof(T); + std::vector dim_w = {dim_tmp, 1, 1}; + w_desc_ = w_d.descriptor(layout, dim_w); + + // ------------------- cudnn workspace, reserve size --------------------- + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_, seq_length_, x_desc_.data(), workspace_size)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_, seq_length_, x_desc_.data(), reserve_size)); + } + + cudnnTensorDescriptor_t* x_desc() { return x_desc_.data(); } + cudnnTensorDescriptor_t* y_desc() { return y_desc_.data(); } + cudnnRNNDataDescriptor_t x_seq_desc() { return x_seq_desc_; } + cudnnRNNDataDescriptor_t y_seq_desc() { return y_seq_desc_; } + cudnnTensorDescriptor_t hx_desc() { return hx_desc_; } + cudnnTensorDescriptor_t cx_desc() { return cx_desc_; } + cudnnTensorDescriptor_t hy_desc() { return hy_desc_; } + cudnnTensorDescriptor_t cy_desc() { return cy_desc_; } + cudnnRNNDescriptor_t rnn_desc() { return rnn_desc_; } + cudnnDropoutDescriptor_t dropout_desc() { return dropout_desc_; } + cudnnFilterDescriptor_t w_desc() { return w_desc_; } + + private: + int seq_length_; + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + float dropout_prob_; + int seed_; + int weight_numel_; + bool initialized_; + bool is_bidirec_; + + std::vector x_desc_; + std::vector y_desc_; + cudnnRNNDataDescriptor_t x_seq_desc_; + cudnnRNNDataDescriptor_t y_seq_desc_; + // A tensor descriptor describing the initial hidden state of the RNN. + cudnnTensorDescriptor_t hx_desc_; + // A tensor descriptor describing the initial cell state for LSTM networks. + cudnnTensorDescriptor_t cx_desc_; + // A tensor descriptor describing the final hidden state of the RNN. + cudnnTensorDescriptor_t hy_desc_; + // A tensor descriptor describing the final cell state for LSTM networks. 
+ cudnnTensorDescriptor_t cy_desc_; + cudnnDropoutDescriptor_t dropout_desc_; + cudnnFilterDescriptor_t w_desc_; + cudnnRNNDescriptor_t rnn_desc_; + + ScopedTensorDescriptor x_d; + ScopedTensorDescriptor y_d; + ScopedRNNTensorDescriptor x_seq_d; + ScopedRNNTensorDescriptor y_seq_d; + ScopedTensorDescriptor hx_d; + ScopedTensorDescriptor cx_d; + ScopedTensorDescriptor hy_d; + ScopedTensorDescriptor cy_d; + ScopedDropoutDescriptor dropout_d; + ScopedFilterDescriptor w_d; + ScopedRNNDescriptor rnn_d; +}; + class ScopedConvolutionDescriptor { public: ScopedConvolutionDescriptor() { diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 7e32720c1d733411178c102d5c4500f722e7d005..562e7542012247c86add9e64f182d857ea969c60 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -38,14 +38,15 @@ extern void *cublas_dso_handle; */ #define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ struct DynLoad__##__name { \ - using FUNC_TYPE = decltype(&::__name); \ template <typename... Args> \ - inline cublasStatus_t operator()(Args... args) { \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using cublas_func = \ + decltype(::__name(std::declval<Args>()...)) (*)(Args...); \ std::call_once(cublas_dso_flag, []() { \ cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \ }); \ static void *p_##__name = dlsym(cublas_dso_handle, #__name); \ - return reinterpret_cast<FUNC_TYPE>(p_##__name)(args...); \ + return reinterpret_cast<cublas_func>(p_##__name)(args...); \ } \ }; \ extern DynLoad__##__name __name diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index ebeb14e940e5fd904e506bca565c4aeae84c93cf..7e85cb57f339331d5dd4233c2cad562c56d1d3af 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -101,6 +101,9 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnDropoutGetStatesSize); \ __macro(cudnnSetDropoutDescriptor); \ __macro(cudnnRestoreDropoutDescriptor); \ + __macro(cudnnCreateRNNDataDescriptor); \ + __macro(cudnnDestroyRNNDataDescriptor); \ + __macro(cudnnSetRNNDataDescriptor); \ __macro(cudnnCreateRNNDescriptor); \ __macro(cudnnGetRNNParamsSize); \ __macro(cudnnGetRNNWorkspaceSize); \ @@ -109,6 +112,11 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnRNNBackwardData); \ __macro(cudnnRNNBackwardWeights); \ __macro(cudnnRNNForwardInference); \ + __macro(cudnnRNNForwardTrainingEx); \ + __macro(cudnnSetRNNPaddingMode); \ + __macro(cudnnRNNBackwardDataEx); \ + __macro(cudnnRNNBackwardWeightsEx); \ + __macro(cudnnRNNForwardInferenceEx); \ __macro(cudnnDestroyDropoutDescriptor); \ __macro(cudnnDestroyRNNDescriptor); \ __macro(cudnnSetTensorNdDescriptorEx); diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 3782eb684f21f8c09e9dac124082ae596fe5d1bc..8fb66c6f34bd8453f1aceb731bb1cd94b8e75a69 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -161,6 +161,12 @@ inline mkldnn::memory::data_type MKLDNNGetDataType<uint8_t>() { return mkldnn::memory::data_type::u8; } +template <> +inline mkldnn::memory::data_type +MKLDNNGetDataType<paddle::platform::bfloat16>() { + return mkldnn::memory::data_type::bf16; +} + inline void Reorder(mkldnn::memory src, mkldnn::memory dst, const mkldnn::engine& engine) { auto reorder_prim = mkldnn::reorder(src, dst); diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index
90b7f501052530a306ba22ea6a244f0ef8fad563..67121e24089f7c6c5b8de985da89039eca85f094 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -59,6 +59,7 @@ void BindGenerator(py::module* m_ptr) { .def_property("_is_init_py", &framework::Generator::GetIsInitPy, &framework::Generator::SetIsInitPy); m.def("default_cpu_generator", &framework::DefaultCPUGenerator); -} // end Generator -} // end namespace pybind + m.def("default_cuda_generator", &framework::GetDefaultCUDAGenerator); +} +} // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 040dd313f1c538b5792538f9da04635ff805b9a8..be4d90597e1e1c647ac6750ee7cebdc2ede8a551 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -60,6 +60,9 @@ void BindAnalysisConfig(py::module *m); void BindAnalysisPredictor(py::module *m); void BindZeroCopyTensor(py::module *m); void BindPaddlePassBuilder(py::module *m); +void BindPaddleInferPredictor(py::module *m); +void BindPaddleInferTensor(py::module *m); +void BindPredictorPool(py::module *m); #ifdef PADDLE_WITH_MKLDNN void BindMkldnnQuantizerConfig(py::module *m); @@ -139,6 +142,15 @@ void ZeroCopyTensorCreate(ZeroCopyTensor &tensor, // NOLINT tensor.copy_from_cpu(static_cast(data.data())); } +template +void PaddleInferTensorCreate(paddle_infer::Tensor &tensor, // NOLINT + py::array_t data) { + std::vector shape; + std::copy_n(data.shape(), data.ndim(), std::back_inserter(shape)); + tensor.Reshape(std::move(shape)); + tensor.CopyFromCpu(static_cast(data.data())); +} + size_t PaddleGetDTypeSize(PaddleDType dt) { size_t size{0}; switch (dt) { @@ -183,6 +195,30 @@ py::array ZeroCopyTensorToNumpy(ZeroCopyTensor &tensor) { // NOLINT return array; } +py::array PaddleInferTensorToNumpy(paddle_infer::Tensor &tensor) { // NOLINT + py::dtype dt = PaddleDTypeToNumpyDType(tensor.type()); + auto tensor_shape = tensor.shape(); + py::array::ShapeContainer shape(tensor_shape.begin(), tensor_shape.end()); + py::array array(dt, std::move(shape)); + + switch (tensor.type()) { + case PaddleDType::INT32: + tensor.CopyToCpu(static_cast(array.mutable_data())); + break; + case PaddleDType::INT64: + tensor.CopyToCpu(static_cast(array.mutable_data())); + break; + case PaddleDType::FLOAT32: + tensor.CopyToCpu(static_cast(array.mutable_data())); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type. 
Now only supports INT32, INT64 and " + "FLOAT32.")); + } + return array; +} + py::bytes SerializePDTensorToBytes(PaddleTensor &tensor) { // NOLINT std::stringstream ss; paddle::inference::SerializePDTensorToStream(&ss, tensor); @@ -200,8 +236,11 @@ void BindInferenceApi(py::module *m) { BindNativePredictor(m); BindAnalysisConfig(m); BindAnalysisPredictor(m); + BindPaddleInferPredictor(m); BindZeroCopyTensor(m); + BindPaddleInferTensor(m); BindPaddlePassBuilder(m); + BindPredictorPool(m); #ifdef PADDLE_WITH_MKLDNN BindMkldnnQuantizerConfig(m); #endif @@ -209,8 +248,17 @@ void BindInferenceApi(py::module *m) { &paddle::CreatePaddlePredictor, py::arg("config")); m->def("create_paddle_predictor", &paddle::CreatePaddlePredictor, py::arg("config")); + m->def("create_predictor", [](const paddle_infer::Config &config) + -> std::unique_ptr { + auto pred = + std::unique_ptr( + new paddle_infer::Predictor(config)); + return std::move(pred); + }); m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes); + m->def("get_version", &paddle_infer::GetVersion); + m->def("get_num_bytes_of_data_type", &paddle_infer::GetNumBytesOfDataType); } namespace { @@ -525,6 +573,19 @@ void BindAnalysisPredictor(py::module *m) { py::arg("dir")); } +void BindPaddleInferPredictor(py::module *m) { + py::class_(*m, "PaddleInferPredictor") + .def(py::init()) + .def("get_input_names", &paddle_infer::Predictor::GetInputNames) + .def("get_output_names", &paddle_infer::Predictor::GetOutputNames) + .def("get_input_handle", &paddle_infer::Predictor::GetInputHandle) + .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle) + .def("run", &paddle_infer::Predictor::Run) + .def("clone", &paddle_infer::Predictor::Clone) + .def("clear_intermediate_tensor", + &paddle_infer::Predictor::ClearIntermediateTensor); +} + void BindZeroCopyTensor(py::module *m) { py::class_(*m, "ZeroCopyTensor") .def("reshape", &ZeroCopyTensor::Reshape) @@ -538,6 +599,26 @@ void BindZeroCopyTensor(py::module *m) { .def("type", &ZeroCopyTensor::type); } +void BindPaddleInferTensor(py::module *m) { + py::class_(*m, "PaddleInferTensor") + .def("reshape", &paddle_infer::Tensor::Reshape) + .def("copy_from_cpu", &PaddleInferTensorCreate) + .def("copy_from_cpu", &PaddleInferTensorCreate) + .def("copy_from_cpu", &PaddleInferTensorCreate) + .def("copy_to_cpu", &PaddleInferTensorToNumpy) + .def("shape", &paddle_infer::Tensor::shape) + .def("set_lod", &paddle_infer::Tensor::SetLoD) + .def("lod", &paddle_infer::Tensor::lod) + .def("type", &paddle_infer::Tensor::type); +} + +void BindPredictorPool(py::module *m) { + py::class_(*m, "PredictorPool") + .def(py::init()) + .def("retrive", &paddle_infer::services::PredictorPool::Retrive, + py::return_value_policy::reference); +} + void BindPaddlePassBuilder(py::module *m) { py::class_(*m, "PaddlePassBuilder") .def(py::init &>()) diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 4377a8c2cef5aab7a200955cd25830d448014817..5ee15073267b6eac8978022a70ead5d0f439c62f 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" #include "pybind11/numpy.h" @@ -104,6 +105,7 @@ struct ValidDTypeToPyArrayChecker { } DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::float16); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::bfloat16); DECLARE_VALID_DTYPE_TO_PY_ARRAY(float); DECLARE_VALID_DTYPE_TO_PY_ARRAY(double); DECLARE_VALID_DTYPE_TO_PY_ARRAY(bool); @@ -119,6 +121,9 @@ inline std::string TensorDTypeToPyDTypeStr( if (type == proto_type) { \ if (std::is_same::value) { \ return "e"; \ + } else if (std::is_same::value) { \ + /* NumPy character code of uint16 due to no support for bfloat16 */ \ + return "H"; \ } else { \ constexpr auto kIsValidDType = ValidDTypeToPyArrayChecker::kValue; \ PADDLE_ENFORCE_EQ( \ @@ -262,10 +267,10 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj, SetTensorFromPyArrayT(self, array, place, zero_copy); } else if (py::isinstance>(array)) { - // TODO(cql): temporary keeping uint16, which is used for casting float16 - // before. It should be depracated later. - SetTensorFromPyArrayT(self, array, place, - zero_copy); + // since there is still no support for bfloat16 in NumPy, + // uint16 is used for casting bfloat16 + SetTensorFromPyArrayT(self, array, place, + zero_copy); } else if (py::isinstance>(array)) { SetTensorFromPyArrayT(self, array, place, zero_copy); } else { @@ -479,6 +484,8 @@ inline framework::Tensor *_sliceTensor(const framework::Tensor &self, switch (src_type) { case framework::proto::VarType::FP16: return _sliceAndConcat(self, obj, dim); + case framework::proto::VarType::BF16: + return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP32: return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP64: diff --git a/paddle/http.log b/paddle/http.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index ace015d2adbb1dd34c2845f5c18e4c38882fabc8..11932ce728889efde2c6cf5ecd8b8252e1192c19 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -20,13 +20,12 @@ rem Paddle CI Task On Windows Platform rem ================================================= rem -------clean up environment----------- +wmic process where name="op_function_generator.exe" call terminate 2>NUL set work_dir=%cd% -if exist build rmdir build /s/q mkdir build cd /d build tree . 
dir paddle\fluid\pybind\Release -taskkill /f /im op_function_generator.exe 2>NUL rem ------initialize the virtual environment------ if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 @@ -59,7 +58,7 @@ if not defined WITH_AVX set WITH_AVX=ON if not defined WITH_TESTING set WITH_TESTING=ON if not defined WITH_PYTHON set WITH_PYTHON=ON if not defined ON_INFER set ON_INFER=ON -if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=OFF +if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON if not defined WITH_TPCACHE set WITH_TPCACHE=ON rem ------set cache third_party------ @@ -243,7 +242,7 @@ dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH% -ctest.exe --output-on-failure -C Release -j 8 +ctest.exe --output-on-failure -C Release -j 8 --repeat until-pass:4 goto:eof :unit_test_error @@ -402,7 +401,7 @@ taskkill /f /im git-remote-https.exe 2>NUL taskkill /f /im vctip.exe 2>NUL taskkill /f /im cvtres.exe 2>NUL taskkill /f /im rc.exe 2>NUL -taskkill /f /im op_function_generator.exe 2>NUL +wmic process where name="op_function_generator.exe" call terminate 2>NUL taskkill /f /im python.exe 2>NUL call paddle_winci\Scripts\deactivate.bat 2>NUL taskkill /f /im python.exe 2>NUL diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 33472397dd0236c294db5df9ba753dcdae19956d..88d9e6e55d577dbb5b883d1c59e2c58d54373742 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -528,8 +528,50 @@ EOF elif [ "$1" == "cp37-cp37m" ]; then pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi + tmpfile_rand=`date +%s%N` + tmpfile=$tmp_dir/$tmpfile_rand + set +e ut_startTime_s=`date +%s` - ctest --output-on-failure -j $2;mactest_error=$? + ctest --output-on-failure -j $2 | tee $tmpfile + failed_test_lists='' + collect_failed_tests + set +x + mactest_error=0 + retry_unittests_record='' + retry_time=3 + exec_times=0 + exec_time_array=('first' 'second' 'third') + if [ -n "$failed_test_lists" ];then + mactest_error=1 + while ( [ $exec_times -lt $retry_time ] && [ -n "${failed_test_lists}" ] ) + do + retry_unittests_record="$retry_unittests_record$failed_test_lists" + failed_test_lists_ult=`echo "${failed_test_lists}"` + read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(" | sed 's/(//' | sed 's/- //' ) + echo "=========================================" + echo "This is the ${exec_time_array[$exec_times]} time to re-run" + echo "=========================================" + echo "The following unittest will be re-run:" + echo "${retry_unittests}" + echo "=========================================" + + retry_unittests_regular='' + for line in ${retry_unittests[@]} ; + do + if [[ "$retry_unittests_regular" == "" ]];then + retry_unittests_regular="^$line$" + else + retry_unittests_regular="$retry_unittests_regular|^$line$" + fi + done + rm -f $tmp_dir/* + failed_test_lists='' + ctest -R "($retry_unittests_regular)" --output-on-failure -j $2 | tee $tmpfile + collect_failed_tests + exec_times=$[$exec_times+1] + done + fi + #mactest_error=$? 
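+ # Illustration (assumed ctest output format): if $failed_test_lists contains + # lines like "- test_conv (Failed)" and "- test_pool (Timeout)", the pipeline + # above yields retry_unittests="test_conv test_pool" and + # retry_unittests_regular="^test_conv$|^test_pool$", so the ctest -R call + # re-runs exactly those tests.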
ut_endTime_s=`date +%s` echo "Mac testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s" paddle version @@ -537,7 +579,21 @@ EOF export http_proxy=$my_proxy export https_proxy=$my_proxy if [ "$mactest_error" != 0 ];then - exit 8; + if [[ "$failed_test_lists" == "" ]]; then + echo "========================================" + echo "There were failed tests, but they succeeded after being re-run:" + echo "========================================" + echo "The following tests were re-run:" + echo "${retry_unittests_record}" + else + failed_test_lists_ult=`echo "${failed_test_lists}"` + echo "========================================" + echo "Summary Failed Tests... " + echo "========================================" + echo "The following tests FAILED: " + echo "${failed_test_lists_ult}" + exit 8; + fi fi fi } @@ -561,6 +617,7 @@ function fetch_upstream_develop_if_not_exist() { function generate_upstream_develop_api_spec() { fetch_upstream_develop_if_not_exist cur_branch=`git branch | grep \* | cut -d ' ' -f2` + git checkout . git checkout -b develop_base_pr upstream/$BRANCH cmake_gen $1 build $2 @@ -1421,6 +1478,7 @@ function main() { init if [ "$CMD" != "assert_file_approvals" ];then python ${PADDLE_ROOT}/tools/summary_env.py + bash ${PADDLE_ROOT}/tools/get_cpu_info.sh fi case $CMD in build_only) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 5f1ccf3f858287066e36abf9412ba1114c526e61..d5793eb424ab794e3e8af8ef2312aac927c272e5 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -217,6 +217,8 @@ from .tensor.search import index_select #DEFINE_ALIAS from .tensor.search import nonzero #DEFINE_ALIAS from .tensor.search import sort #DEFINE_ALIAS from .framework.random import manual_seed #DEFINE_ALIAS +from .framework.random import get_cuda_rng_state #DEFINE_ALIAS +from .framework.random import set_cuda_rng_state #DEFINE_ALIAS from .framework import Variable #DEFINE_ALIAS from .framework import ParamAttr #DEFINE_ALIAS from .framework import create_global_var #DEFINE_ALIAS @@ -230,6 +232,7 @@ from .framework import grad #DEFINE_ALIAS from .framework import no_grad #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS from .framework import load #DEFINE_ALIAS +from .framework import SaveLoadConfig #DEFINE_ALIAS from .framework import DataParallel #DEFINE_ALIAS from .framework import NoamDecay #DEFINE_ALIAS @@ -267,5 +270,6 @@ from .
import static # high-level api from .hapi import Model from .hapi import callbacks +from .hapi import summary import paddle.text import paddle.vision diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py index 5bc9c1444d2b34f057cd92782eb50e5fc23916eb..f7930d34f93e21bf3f832da828fb0036742b5091 100644 --- a/python/paddle/dataset/uci_housing.py +++ b/python/paddle/dataset/uci_housing.py @@ -74,7 +74,8 @@ def load_data(filename, feature_num=14, ratio=0.8): data = data.reshape(data.shape[0] // feature_num, feature_num) maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( axis=0) / data.shape[0] - feature_range(maximums[:-1], minimums[:-1]) + # if you want to print the distribution of the input data, you can use the feature_range function + #feature_range(maximums[:-1], minimums[:-1]) for i in six.moves.range(feature_num - 1): data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) offset = int(data.shape[0] * ratio) diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 42ac68ba1a64de54f029878ceab08435c924d087..5f0cf9f93d62eba9b81e8a834b52f84122f2702d 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -50,3 +50,10 @@ distributed_optimizer = fleet.distributed_optimizer save_inference_model = fleet.save_inference_model save_persistables = fleet.save_persistables minimize = fleet.minimize +distributed_model = fleet.distributed_model +step = fleet.step +clear_grad = fleet.clear_grad +set_lr = fleet.set_lr +get_lr = fleet.get_lr +state_dict = fleet.state_dict +set_state_dict = fleet.set_state_dict diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 26063d1b8a9225aff63628bb37f433ec95257dc7..62967a202ab53e8a5dc835900280259508bb640d 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -15,10 +15,25 @@ import paddle from paddle.distributed.fleet.proto import distributed_strategy_pb2 from paddle.fluid.framework import Variable, set_flags, core +from paddle.fluid.wrapped_decorator import wrap_decorator import google.protobuf.text_format __all__ = ["DistributedStrategy"] +non_auto_func_called = True + + +def __non_auto_func_called__(func): + def __impl__(*args, **kwargs): + global non_auto_func_called + non_auto_func_called = False + return func(*args, **kwargs) + + return __impl__ + + +is_strict_auto = wrap_decorator(__non_auto_func_called__) + def get_msg_dict(msg): res_dict = {} @@ -118,7 +133,7 @@ class DistributedStrategy(object): strategy = fleet.DistributedStrategy() strategy.dgc = True strategy.recompute = True - strategy.recompute_configs = {"checkpoint": ["x"]} + strategy.recompute_configs = {"checkpoints": ["x"]} strategy.save_to_prototxt("dist_strategy.prototxt") """ with open(output, "w") as fout: @@ -133,7 +148,7 @@ class DistributedStrategy(object): import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() - strategy.load_from_prototxt("dist_strategy.protoxt") + strategy.load_from_prototxt("dist_strategy.prototxt") """ with open(pb_file, 'r') as f: self.strategy = google.protobuf.text_format.Merge( @@ -147,6 +162,7 @@ class DistributedStrategy(object): Examples: ..
code-block:: python + import paddle exe_strategy = paddle.fluid.ExecutionStrategy() exe_strategy.num_threads = 10 exe_strategy.num_iteration_per_drop_scope = 10 @@ -163,6 +179,7 @@ class DistributedStrategy(object): return execution_strategy @execution_strategy.setter + @is_strict_auto def execution_strategy(self, strategy): fields = self.strategy.execution_strategy.DESCRIPTOR.fields for f in fields: @@ -179,6 +196,7 @@ class DistributedStrategy(object): Examples: .. code-block:: python + import paddle build_strategy = paddle.fluid.BuildStrategy() build_strategy.enable_sequential_execution = True build_strategy.fuse_elewise_add_act_ops = True @@ -201,6 +219,7 @@ class DistributedStrategy(object): return build_strategy @build_strategy.setter + @is_strict_auto def build_strategy(self, strategy): fields = self.strategy.build_strategy.DESCRIPTOR.fields for f in fields: @@ -235,6 +254,7 @@ class DistributedStrategy(object): return self.strategy.a_sync @a_sync.setter + @is_strict_auto def a_sync(self, flag): if isinstance(flag, bool): self.strategy.a_sync = flag @@ -252,14 +272,19 @@ class DistributedStrategy(object): a dict. **Notes**: - **Detailed arguments for a_sync_configs** - **k_step**: number of local optimization updates before communication - **max_merge_var_num**: maximum number of merged gradients before communication - **send_queue_size**: a buffer size of worker communication - **independent_recv_thread**: if we are using independent recv thread for communication - **thread_pool_size**: number of thread pool - **send_wait_times**: waiting time for sending gradients - **runtime_split_send_recv**: if we are using Tensor split for send and recv during runtime + k_steps(int): number of local optimization updates before communication + + max_merge_var_num(int): maximum number of merged gradients before communication + + send_queue_size(int): a buffer size of worker communication + + independent_recv_thread(bool): if we are using independent recv thread for communication + + thread_pool_size(int): size of the thread pool + + send_wait_times(int): waiting time for sending gradients + + runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime Examples: .. code-block:: python @@ -270,15 +295,17 @@ class DistributedStrategy(object): strategy = fleet.DistributedStrategy() strategy.a_sync = True # by default this is True - configs = {"k_step": 10000, "send_queue_size": 32} + configs = {"k_steps": 1024, "send_queue_size": 32} strategy.a_sync_configs = configs # code block for defining loss and local optimizer # sgd = fleet.distributed_optimizer(optimizer, strategy) + """ return get_msg_dict(self.strategy.a_sync_configs) @a_sync_configs.setter + @is_strict_auto def a_sync_configs(self, configs): check_configs_key(self.strategy.a_sync_configs, configs, "a_sync_configs") @@ -301,6 +328,7 @@ class DistributedStrategy(object): return self.strategy.amp @amp.setter + @is_strict_auto def amp(self, flag): if isinstance(flag, bool): self.strategy.amp = flag @@ -314,14 +342,21 @@ class DistributedStrategy(object): settings that can be configured through a dict. **Notes**: - **init_loss_scaling(float)**: The initial loss scaling factor. Default 32768. - **use_dynamic_loss_scaling(bool)**: Whether to use dynamic loss scaling. Default True. - **incr_every_n_steps(int)**: Increases loss scaling every n consecutive steps with finite gradients. Default 1000. - **decr_every_n_nan_or_inf(int)**: Decreases loss scaling every n accumulated steps with nan or inf gradients. Default 2.
- **incr_ratio(float)**: The multiplier to use when increasing the loss scaling. Default 2.0. - **decr_ratio(float)**: The less-than-one-multiplier to use when decreasing the loss scaling. Default 0.5. - **custom_white_list(list[str])**: Users' custom white list which always execution fp16. - **custom_black_list(list[str])**: Users' custom black list which forbidden execution fp16. + init_loss_scaling(float): The initial loss scaling factor. Default 32768. + + use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. Default True. + + incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients. Default 1000. + + decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients. Default 2. + + incr_ratio(float): The multiplier to use when increasing the loss scaling. Default 2.0. + + decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling. Default 0.5. + + custom_white_list(list[str]): Users' custom white list of ops that are always executed in fp16. + + custom_black_list(list[str]): Users' custom black list of ops that are forbidden from fp16 execution. Examples: .. code-block:: python @@ -336,6 +371,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.amp_configs) @amp_configs.setter + @is_strict_auto def amp_configs(self, configs): check_configs_key(self.strategy.amp_configs, configs, "amp_configs") assign_configs_value(self.strategy.amp_configs, configs) @@ -373,6 +409,7 @@ class DistributedStrategy(object): return self.strategy.sync_nccl_allreduce @sync_nccl_allreduce.setter + @is_strict_auto def sync_nccl_allreduce(self, flag): if isinstance(flag, bool): self.strategy.sync_nccl_allreduce = flag @@ -396,6 +433,7 @@ class DistributedStrategy(object): return self.strategy.use_hierarchical_allreduce @use_hierarchical_allreduce.setter + @is_strict_auto def use_hierarchical_allreduce(self, flag): if isinstance(flag, bool): self.strategy.use_hierarchical_allreduce = flag @@ -420,6 +458,7 @@ class DistributedStrategy(object): return self.strategy.hierarchical_allreduce_inter_nranks @hierarchical_allreduce_inter_nranks.setter + @is_strict_auto def hierarchical_allreduce_inter_nranks(self, value): if isinstance(value, int): self.strategy.hierarchical_allreduce_inter_nranks = value @@ -446,6 +485,7 @@ class DistributedStrategy(object): return self.strategy.sync_batch_norm @sync_batch_norm.setter + @is_strict_auto def sync_batch_norm(self, flag): if isinstance(flag, bool): self.strategy.sync_batch_norm = flag @@ -468,6 +508,7 @@ class DistributedStrategy(object): return self.strategy.fuse_all_reduce_ops @fuse_all_reduce_ops.setter + @is_strict_auto def fuse_all_reduce_ops(self, flag): if isinstance(flag, bool): self.strategy.fuse_all_reduce_ops = flag @@ -491,6 +532,7 @@ class DistributedStrategy(object): return self.strategy.fuse_grad_size_in_MB @fuse_grad_size_in_MB.setter + @is_strict_auto def fuse_grad_size_in_MB(self, value): if isinstance(value, int): self.strategy.fuse_grad_size_in_MB = value @@ -502,6 +544,7 @@ class DistributedStrategy(object): return self.strategy.fuse_grad_size_in_TFLOPS @_fuse_grad_size_in_TFLOPS.setter + @is_strict_auto def _fuse_grad_size_in_TFLOPS(self, value): if isinstance(value, float): self.strategy.fuse_grad_size_in_TFLOPS = value @@ -528,6 +571,7 @@ class DistributedStrategy(object): return self.strategy.nccl_comm_num @nccl_comm_num.setter + @is_strict_auto def nccl_comm_num(self, value): if isinstance(value, int): self.strategy.nccl_comm_num =
value @@ -535,6 +579,7 @@ class DistributedStrategy(object): print("WARNING: nccl_comm_num should have value of int type") @recompute.setter + @is_strict_auto def recompute(self, flag): if isinstance(flag, bool): self.strategy.recompute = flag @@ -553,12 +598,13 @@ class DistributedStrategy(object): import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.recompute = True - strategy.recompute_configs = {"checkpionts": ["x", "y"]} + strategy.recompute_configs = {"checkpoints": ["x", "y"]} """ return get_msg_dict(self.strategy.recompute_configs) @recompute_configs.setter + @is_strict_auto def recompute_configs(self, configs): check_configs_key(self.strategy.recompute_configs, configs, "checkpoint_configs") @@ -583,6 +629,7 @@ class DistributedStrategy(object): return self.strategy.pipeline @pipeline.setter + @is_strict_auto def pipeline(self, flag): if isinstance(flag, bool): self.strategy.pipeline = flag @@ -603,6 +650,7 @@ class DistributedStrategy(object): **Notes**: **Detailed arguments for pipeline_configs** + **micro_batch**: the number of small batches in each user defined batch Examples: @@ -618,6 +666,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.pipeline_configs) @pipeline_configs.setter + @is_strict_auto def pipeline_configs(self, configs): check_configs_key(self.strategy.pipeline_configs, configs, "pipeline_configs") @@ -626,10 +675,10 @@ class DistributedStrategy(object): @property def localsgd(self): """ - Indicating whether we are using Local SGD training. For more details, please refer to - [Don't Use Large Mini-Batches, Use Local SGD](https://arxiv.org/pdf/1808.07217.pdf), + Indicating whether we are using Local SGD training. Default Value: False + For more details, please refer to + `Don't Use Large Mini-Batches, Use Local SGD `_. - Default Value: False Examples: .. code-block:: python @@ -642,6 +691,7 @@ class DistributedStrategy(object): return self.strategy.localsgd @localsgd.setter + @is_strict_auto def localsgd(self, flag): if isinstance(flag, bool): self.strategy.localsgd = flag @@ -655,13 +705,12 @@ class DistributedStrategy(object): setting that can be configured through a dict. **Notes**: - **k_steps(int)**: The local steps for training before parameter - synchronization. Default 1. If strategy.auto is set True, the - local steps will be calculated automatically during training. - The algorithm is referenced in this paper: - [Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD](https://arxiv.org/pdf/1810.08313.pdf). - In this case, k_steps indicates the first local steps which - is suggested setting to 1. + k_steps(int): The local steps for training before parameter synchronization. Default 1. + + If strategy.auto is set True, the local steps will be calculated automatically during training. + The algorithm is referenced in this paper: + `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD `_. + In this case, k_steps indicates the initial local steps, which is suggested to be set to 1. Examples: ..
code-block:: python @@ -675,6 +724,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.localsgd_configs) @localsgd_configs.setter + @is_strict_auto def localsgd_configs(self, configs): check_configs_key(self.strategy.localsgd_configs, configs, "localsgd_configs") @@ -699,6 +749,7 @@ class DistributedStrategy(object): return self.strategy.dgc @dgc.setter + @is_strict_auto def dgc(self, flag): if isinstance(flag, bool): self.strategy.dgc = flag @@ -712,14 +763,16 @@ class DistributedStrategy(object): settings that can be configured through a dict. **Notes**: - **rampup_begin_step(int)**: The beginning step from which gradient compression is implemented. Default 0. - **rampup_step(int)**: Time steps used in sparsity warm-up periods. Default is 1. - For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, - it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. And when reach sparsity array - ends, it will use 0.999 then and after. - **sparsity(list[float])**: Get top important element from gradient tensor, the ratio is (1 - sparsity). - Default is [0.999]. For example, if the sparsity is [0.99, 0.999], the top [1%, 0.1%] important - element will be transmitted. + rampup_begin_step(int): The beginning step from which gradient compression is implemented. Default 0. + + rampup_step(int): Time steps used in sparsity warm-up periods. Default is 1. \ + For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, \ + it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. When the sparsity array \ + ends, it will keep using 0.999 from then on. + + sparsity(list[float]): Get the top important elements from the gradient tensor; the ratio is (1 - sparsity). \ + Default is [0.999]. For example, if the sparsity is [0.99, 0.999], the top [1%, 0.1%] important \ + elements will be transmitted. Examples: .. code-block:: python @@ -732,6 +785,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.dgc_configs) @dgc_configs.setter + @is_strict_auto def dgc_configs(self, configs): check_configs_key(self.strategy.dgc_configs, configs, "dgc_configs") assign_configs_value(self.strategy.dgc_configs, configs) @@ -749,7 +803,8 @@ class DistributedStrategy(object): to model parameters. Examples: - .. code-block:: python + .. code-block:: python + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.gradient_merge = True @@ -758,6 +813,7 @@ class DistributedStrategy(object): return self.strategy.gradient_merge @gradient_merge.setter + @is_strict_auto def gradient_merge(self, flag): if isinstance(flag, bool): self.strategy.gradient_merge = flag @@ -768,11 +824,15 @@ class DistributedStrategy(object): def gradient_merge_configs(self): """ the key-value configs of distribute_strategy - Keys: - k_steps (int): the update period of the parameters - avg (bool): whether to average the gradients of each mini-batch, - the default value is `True` - Example: + + **Note**: + k_steps(int): the update period of the parameters. + + avg(bool): whether to average the gradients of each mini-batch, the default value is `True` + + Examples: + ..
code-block:: python + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.gradient_merge = True @@ -781,6 +841,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.gradient_merge_configs) @gradient_merge_configs.setter + @is_strict_auto def gradient_merge_configs(self, configs): check_configs_key(self.strategy.gradient_merge_configs, configs, "gradient_configs") @@ -805,6 +866,7 @@ class DistributedStrategy(object): return self.strategy.lars @lars.setter + @is_strict_auto def lars(self, flag): if isinstance(flag, bool): self.strategy.lars = flag @@ -826,6 +888,7 @@ class DistributedStrategy(object): Examples: .. code-block:: python + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.lars = True @@ -839,6 +902,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.lars_configs) @lars_configs.setter + @is_strict_auto def lars_configs(self, configs): check_configs_key(self.strategy.lars_configs, configs, "lars_configs") assign_configs_value(self.strategy.lars_configs, configs) @@ -864,6 +928,7 @@ class DistributedStrategy(object): return self.strategy.lamb @lamb.setter + @is_strict_auto def lamb(self, flag): if isinstance(flag, bool): self.strategy.lamb = flag @@ -882,6 +947,7 @@ class DistributedStrategy(object): Examples: .. code-block:: python + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.lamb = True @@ -893,15 +959,21 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.lamb_configs) @lamb_configs.setter + @is_strict_auto def lamb_configs(self, configs): check_configs_key(self.strategy.lamb_configs, configs, "lamb_configs") assign_configs_value(self.strategy.lamb_configs, configs) @property def elastic(self): + """ + Indicating whether we want to run the current distributed training on clusters with elastic resources. + Currently, this configuration is not valid. + """ return self.strategy.elastic @elastic.setter + @is_strict_auto def elastic(self, flag): if isinstance(flag, bool): self.strategy.elastic = flag @@ -910,6 +982,25 @@ class DistributedStrategy(object): @property def auto(self): + """ + Indicating whether we are using the auto-parallel configuration. + This is currently an experimental feature. Auto-parallelism + can be used only when a user does not set any other + strategy configs except auto. For details, please reference the following + code example. + Default Value: False + + Examples: + .. code-block:: python + + import paddle + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.auto = True + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + """ return self.strategy.auto @auto.setter @@ -921,9 +1012,27 @@ class DistributedStrategy(object): @property def cudnn_exhaustive_search(self): + """ + Indicating whether to use the exhaustive search method to choose convolution algorithms. + Exhaustive search attempts all cuDNN algorithms to choose the fastest algorithm. + This method is time-consuming; the chosen algorithm will be cached for the given layer specifications. + Once the layer specifications (like batch size, feature map size) are changed, it will search again. + Default Value: True + + Examples: + ..
code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.cudnn_exhaustive_search = False + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + """ return self.strategy.cudnn_exhaustive_search @cudnn_exhaustive_search.setter + @is_strict_auto def cudnn_exhaustive_search(self, flag): if isinstance(flag, bool): self.strategy.cudnn_exhaustive_search = flag @@ -934,9 +1043,28 @@ class DistributedStrategy(object): @property def conv_workspace_size_limit(self): + """ + The workspace limit size in MB for choosing cuDNN convolution algorithms. + The inner function of cuDNN obtains the fastest suitable algorithm that fits within this memory limit. + Usually, a large workspace size may lead to faster algorithms being chosen, + but it significantly increases memory usage. Users need to trade off between memory and speed. + Default Value: 4000 + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.conv_workspace_size_limit = 1024 + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + + """ return self.strategy.conv_workspace_size_limit @conv_workspace_size_limit.setter + @is_strict_auto def conv_workspace_size_limit(self, value): if isinstance(value, int): self.strategy.conv_workspace_size_limit = value @@ -947,9 +1075,26 @@ class DistributedStrategy(object): @property def cudnn_batchnorm_spatial_persistent(self): + """ + Indicates whether to use the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode in batchnorm. + This takes effect only when cuDNN is used. + Default Value: True + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.cudnn_batchnorm_spatial_persistent = True + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + + """ return self.strategy.cudnn_batchnorm_spatial_persistent @cudnn_batchnorm_spatial_persistent.setter + @is_strict_auto def cudnn_batchnorm_spatial_persistent(self, flag): if isinstance(flag, bool): self.strategy.cudnn_batchnorm_spatial_persistent = flag @@ -981,6 +1126,12 @@ class DistributedStrategy(object): if core.globals().is_public(key): core.globals()[key] = values[i] + def _is_strict_auto(self): + global non_auto_func_called + if self.strategy.auto and non_auto_func_called: + return True + return False + def __repr__(self): fields = self.strategy.DESCRIPTOR.fields for f in fields: diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 282ac29d6f9dafb4eb3b83471157464620326348..b9189492694f3a628843156cb329a43787e64ad2 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -13,7 +13,11 @@ # limitations under the License.
from __future__ import print_function +import copy +import warnings import paddle +from paddle.fluid.framework import dygraph_only +from paddle.fluid import compiler from .role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker, RoleMakerBase from .strategy_compiler import StrategyCompiler from .distributed_strategy import DistributedStrategy @@ -21,6 +25,7 @@ from .meta_optimizer_factory import MetaOptimizerFactory from .runtime_factory import RuntimeFactory from .util_factory import UtilFactory from paddle.fluid.wrapped_decorator import wrap_decorator +from paddle.fluid.dygraph import parallel_helper def _inited_runtime_handler_(func): @@ -35,7 +40,24 @@ def _inited_runtime_handler_(func): return __impl__ +def _is_non_distributed_check_(func): + def __impl__(*args, **kwargs): + cls = args[0] + + if cls._role_maker is not None and cls._role_maker._is_non_distributed( + ) is True: + warnings.warn( + "%s() function doesn't work when using non-distributed fleet." % + (func.__name__)) + return + + return func(*args, **kwargs) + + return __impl__ + + inited_runtime_handler = wrap_decorator(_inited_runtime_handler_) +is_non_distributed_check = wrap_decorator(_is_non_distributed_check_) class Fleet(object): @@ -159,6 +181,12 @@ class Fleet(object): "`role_maker` should be subclass of `RoleMakerBase`, but got {}". format(type(role_maker))) self.strategy_compiler = StrategyCompiler() + if paddle.fluid.framework.in_dygraph_mode(): + if parallel_helper._is_parallel_ctx_initialized(): + warnings.warn( + "The dygraph parallel environment has been initialized.") + else: + paddle.distributed.init_parallel_env() return None def is_first_worker(self): @@ -367,6 +395,7 @@ class Fleet(object): """ self._role_maker.barrier_worker() + @is_non_distributed_check @inited_runtime_handler def init_worker(self): """ @@ -391,6 +420,7 @@ class Fleet(object): """ self._runtime_handle._init_worker() + @is_non_distributed_check @inited_runtime_handler def init_server(self, *args, **kwargs): """ @@ -416,6 +446,7 @@ class Fleet(object): """ self._runtime_handle._init_server(*args, **kwargs) + @is_non_distributed_check @inited_runtime_handler def run_server(self): """ @@ -440,6 +471,7 @@ class Fleet(object): """ self._runtime_handle._run_server() + @is_non_distributed_check @inited_runtime_handler def stop_worker(self): """ @@ -564,12 +596,344 @@ class Fleet(object): """ self.user_defined_optimizer = optimizer + if paddle.fluid.framework.in_dygraph_mode(): + return self + if strategy == None: strategy = DistributedStrategy() self.user_defined_strategy = strategy self.valid_strategy = None return self + @dygraph_only + def distributed_model(self, model): + """ + Return a dygraph distributed data parallel model (Layer). + Only works in dygraph mode. + + Examples: + .. code-block:: python + import paddle + import paddle.nn as nn + from paddle.distributed import fleet + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize fleet environment + fleet.init(is_collective=True) + + # 3. create layer & optimizer + layer = LinearNet() + loss_fn = nn.MSELoss() + adam = paddle.optimizer.Adam( + learning_rate=0.001, parameters=layer.parameters()) + + # 4.
get data_parallel model using fleet + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + # 5. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + print("loss:", loss.numpy()) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + if __name__ == '__main__': + paddle.distributed.spawn(train) + """ + assert model is not None + self.model = paddle.DataParallel(model) + return self.model + + @dygraph_only + def state_dict(self): + """ + Get state dict information from optimizer. + Only works in dygraph mode. + + Returns: + state_dict(dict) : dict contains all the Tensor used by optimizer + + Examples: + .. code-block:: python + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) + + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) + + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + state_dict = adam.state_dict() + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.state_dict() + + @dygraph_only + def set_state_dict(self, state_dict): + """ + Load optimizer state dict. + Only works in dygraph mode. + + Args: + state_dict(dict) : Dict contains all the Tensor needed by optimizer + + Returns: None + + Examples: + .. code-block:: python + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) + + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) + + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + state_dict = adam.state_dict() + paddle.framework.save(state_dict, "paddle_dy") + para_state_dict, opti_state_dict = paddle.framework.load("paddle_dy") + adam.set_state_dict(opti_state_dict) + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.set_state_dict(state_dict) + + @dygraph_only + def set_lr(self, value): + """ + Set the value of the learning rate manually in the optimizer. + Only works in dygraph mode. + + Args: + value (float|Tensor): the value of learning rate + + Returns: None + + Examples: + ..
code-block:: python + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) + + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) + + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] + for i in range(5): + adam.set_lr(lr_list[i]) + lr = adam.get_lr() + print("current lr is {}".format(lr)) + # Print: + # current lr is 0.2 + # current lr is 0.3 + # current lr is 0.4 + # current lr is 0.5 + # current lr is 0.6 + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.set_lr(value) + + @dygraph_only + def get_lr(self): + """ + Get current step learning rate. + Only works in dygraph mode. + + Returns: + float: The learning rate of the current step. + + Examples: + .. code-block:: python + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) + + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) + + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + lr = adam.get_lr() + print(lr) # 0.01 + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.get_lr() + + @dygraph_only + def step(self): + """ + Execute the optimizer once. + Only works in dygraph mode. + + Returns: None + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + from paddle.distributed import fleet + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize fleet environment + fleet.init(is_collective=True) + + # 3. create layer & optimizer + layer = LinearNet() + loss_fn = nn.MSELoss() + adam = paddle.optimizer.Adam( + learning_rate=0.001, parameters=layer.parameters()) + + # 4. get data_parallel model using fleet + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + # 5. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + print("loss:", loss.numpy()) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + if __name__ == '__main__': + paddle.distributed.spawn(train) + + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.step() + + @dygraph_only + def clear_grad(self): + """ + Clear the gradients of all optimized parameters. + Only works in dygraph mode. + + Returns: None + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + from paddle.distributed import fleet + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(): + # 1.
enable dynamic mode + paddle.disable_static() + + # 2. initialize fleet environment + fleet.init(is_collective=True) + + # 3. create layer & optimizer + layer = LinearNet() + loss_fn = nn.MSELoss() + adam = paddle.optimizer.Adam( + learning_rate=0.001, parameters=layer.parameters()) + + # 4. get data_parallel model using fleet + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + # 5. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + print("loss:", loss.numpy()) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + if __name__ == '__main__': + paddle.distributed.spawn(train) + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.clear_grad() + def minimize(self, loss, startup_program=None, @@ -593,8 +957,8 @@ class Fleet(object): tuple: tuple (optimize_ops, params_grads), A list of operators appended by minimize and a list of (param, grad) variable pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. - The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to - indicate program pruning. If so, the program will be pruned by ``feed`` and + The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. Examples: @@ -619,6 +983,11 @@ class Fleet(object): # for more examples, please reference https://github.com/PaddlePaddle/FleetX """ + if paddle.fluid.framework.in_dygraph_mode(): + # imitate target optimizer retrieval + target_opt = self.user_defined_optimizer + return target_opt.minimize(loss) + context = {} # cache original feed forward program self.origin_main_program = loss.block.program @@ -640,6 +1009,18 @@ class Fleet(object): MetaOptimizerFactory()._get_valid_meta_optimizers( self.user_defined_optimizer) + context["user_defined_strategy"] = copy.copy(self.user_defined_strategy) + + # trigger the auto-parallel in very strict condition + # strategy = DistributedStrategy() + # strategy.auto = True + # optimizer = paddle.optimizer.SGD(learning_rate=0.1) + # optimizer = fleet.distributed_optimizer(optimizer, strategy) + if self.user_defined_strategy._is_strict_auto(): + # turn on all the strategy for each optimizer + for opt in distributed_optimizer_list: + opt._enable_strategy(self.user_defined_strategy) + valid_optimizer_list = [] valid_graph_optimizer_list = [] can_not_apply_optimizer_list = [] @@ -672,6 +1053,20 @@ class Fleet(object): optimize_ops = [] params_grads = [] + if self._role_maker._is_non_distributed() and not self._is_collective: + if self._runtime_handle is None: + self._runtime_handle = RuntimeFactory()._create_runtime(context) + + compiled_program = compiler.CompiledProgram( + self.origin_main_program).with_data_parallel( + loss_name=loss.name, share_vars_from=None) + loss.block.program._graph = compiled_program + return self.user_defined_optimizer.minimize( + loss, + startup_program=startup_program, + parameter_list=parameter_list, + no_grad_set=no_grad_set) + if meta_optimizer: optimize_ops, params_grads = meta_optimizer.minimize( loss, diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 
25f2d0dd3f45855d9f337c6b7154db9cb5bbae45..8614b1861343b8e48b55a8e75d9e432ef6329184 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -232,6 +232,8 @@ class PaddleCloudRoleMaker(RoleMakerBase): self._node_type_comm = None self._all_comm = None + self._non_distributed = False + if not self._is_collective: self._hdfs_name = kwargs.get("hdfs_name", "") self._hdfs_ugi = kwargs.get("hdfs_ugi", "") @@ -373,6 +375,15 @@ class PaddleCloudRoleMaker(RoleMakerBase): self.generate_role() return self._server_endpoints + def _is_non_distributed(self): + """ + Return True if the environment variables required by fleetrun are not found + (i.e., the code was launched directly with python rather than fleetrun) + """ + if not self._role_is_generated: + self.generate_role() + return self._non_distributed + def _heter_worker_num(self): """ get heter worker nums @@ -409,13 +420,22 @@ class PaddleCloudRoleMaker(RoleMakerBase): try: # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002 - self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST", - "").split(",") - assert self._server_endpoints != "" self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") - assert self._server_endpoints != "" - + self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST") + if self._server_endpoints is None: + # back to non_distributed execution. + self._server_endpoints = "" + self._trainers_num = 1 + self._role = Role.WORKER + self._current_id = 0 + self._node_num = 1 + self._heter_trainers_num = 0 + self._heter_trainer_endpoints = None + self._non_distributed = True + return + + self._server_endpoints = self._server_endpoints.split(",") trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"]) training_role = os.environ["TRAINING_ROLE"] @@ -488,7 +508,11 @@ class PaddleCloudRoleMaker(RoleMakerBase): assert (self._training_role == "TRAINER") self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS") self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") - assert self._worker_endpoints is not None, "can't find PADDLE_TRAINER_ENDPOINTS" + if self._worker_endpoints is None: + # back to non_distributed execution.
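+ # e.g. the script was started with plain `python train.py` rather than + # fleetrun, so PADDLE_TRAINER_ENDPOINTS is unset and a single local + # worker endpoint is assumed: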
+ self._worker_endpoints = "127.0.0.1:6170" + self._cur_endpoint = self._worker_endpoints + self._non_distributed = True self._worker_endpoints = self._worker_endpoints.split(",") self._trainers_num = len(self._worker_endpoints) self._node_num = len( diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 29a1bda92f17443e6c38b070379481aaa419b1d4..7778acaf83b310cfa9a04059ce6d3be2d5326089 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -200,11 +200,11 @@ def launch_collective(args): start_port = os.environ.get('FLAGS_START_PORT') if cloud_utils.use_paddlecloud() and trainers_num != 1: cluster, pod = cloud_utils.get_cloud_cluster(args.ips, gpus, start_port) - logger.info("get cluster from cloud:{}".format(cluster)) + logger.debug("get cluster from cloud:{}".format(cluster)) else: # trainers_num = 1 or not use paddlecloud ips="a,b" cluster, pod = get_cluster_from_args(args, gpus) - logger.info("get cluster from args:{}".format(cluster)) + logger.debug("get cluster from args:{}".format(cluster)) procs = start_local_trainers( cluster, @@ -217,7 +217,8 @@ def launch_collective(args): alive = watch_local_trainers(procs, cluster.trainers_nranks()) if not alive: - logger.info("Local procs complete, POD info:{}".format(pod)) + logger.info("Local processes completed.") + logger.debug("POD info:{}".format(pod)) break time.sleep(3) @@ -313,18 +314,26 @@ def launch_ps(args): cmds = [] log_fns = [] for idx, cur_server in enumerate(pod.servers): - current_env.update({ + proc_env = { "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints, "PADDLE_PORT": cur_server.endpoint.split(":")[1], "TRAINING_ROLE": "PSERVER", "PADDLE_TRAINERS_NUM": str(worker_num), "POD_IP": cur_server.endpoint.split(":")[0] - }) + } + current_env.update(proc_env) cmd = [sys.executable, "-u", args.training_script ] + args.training_script_args cmds.append(cmd) + if idx == 0: + logger.info( + "Local server starts {} processes. First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.servers), + pretty_print_envs(proc_env, ("Distributed Envs", "Value")))) + if args.log_dir is not None: os.system("mkdir -p {}".format(args.log_dir)) fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w") @@ -338,21 +347,32 @@ def launch_ps(args): tp.rank = cur_server.rank tp.local_rank = idx tp.log_fn = fn - tp.log_offset = 0 if fn else None + tp.log_offset = fn.tell() if fn else None tp.cmd = cmd procs.append(tp) for idx, cur_worker in enumerate(pod.workers): - current_env.update({ + proc_env = { "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints, + "PADDLE_TRAINER_ENDPOINTS": worker_endpoints, "PADDLE_TRAINERS_NUM": str(worker_num), "TRAINING_ROLE": "TRAINER", "PADDLE_TRAINER_ID": str(cur_worker.rank) - }) + } + current_env.update(proc_env) + cmd = [sys.executable, "-u", args.training_script ] + args.training_script_args cmds.append(cmd) + + if idx == 0: + logger.info( + "Local worker starts {} processes.
First process distributed "
+ "environment info (Only For Debug): {}".format(
+ len(pod.workers),
+ pretty_print_envs(proc_env, ("Distributed Envs", "Value"))))
+
if args.log_dir is not None:
os.system("mkdir -p {}".format(args.log_dir))
fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w")
@@ -366,11 +386,14 @@ def launch_ps(args):
tp.rank = cur_worker.rank
tp.local_rank = idx
tp.log_fn = fn
- tp.log_offset = 0 if fn else None
+ tp.log_offset = fn.tell() if fn else None
tp.cmd = cmd
procs.append(tp)
+ logger.info(
+ "Please check the server and worker logs in {}/workerlog.* and {}/serverlog.*".
+ format(args.log_dir, args.log_dir))
# only wait worker to finish here
for i, proc in enumerate(procs):
if i < len(pod.servers):
@@ -403,16 +426,16 @@ def launch():
cuda_device_num = fluid.core.get_cuda_device_count()
if len(has_ps_args) > 0 or cuda_device_num == 0:
logger.info(
- "Run parameter-sever cpu mode. pserver args:{}, cuda count:{}".
+ "Run parameter-server cpu mode. pserver arguments:{}, cuda count:{}".
format(has_ps_args, cuda_device_num))
launch_ps(args)
elif len(has_collective_args) > 0:
- logger.info("Run collective gpu mode. gpu args:{}, cuda count:{}".
+ logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}".
format(has_collective_args, cuda_device_num))
launch_collective(args)
else:
logger.warning(
- "Not found distinct args. Default use gpu collective mode")
+ "No distinct arguments found. Defaulting to gpu collective mode")
launch_collective(args)
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 350d8ae2b44db3e8f8e6b00d95c2b7a9ca91f88b..3da5aed8201ace6ccf9eed1ff322a7c6304de4a6 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -253,7 +253,8 @@ def terminate_local_procs(procs):
for p in procs:
if p.proc.poll() is None:
p.proc.terminate()
- p.log_fn.close()
+ if p.log_fn:
+ p.log_fn.close()
logger.debug("terminate process id:{}".format(p.proc.pid))
#wait all process terminiated
@@ -338,6 +339,45 @@ def get_ports(num, offset):
return ports
+def pretty_print_envs(envs, header=None):
+ spacing = 2
+ max_k = 40
+ max_v = 45
+
+ for k, v in envs.items():
+ max_k = max(max_k, len(k))
+
+ h_format = "{{:^{}s}}{}{{:<{}s}}\n".format(max_k, " " * spacing, max_v)
+ l_format = "{{:<{}s}}{{}}{{:<{}s}}\n".format(max_k, max_v)
+ length = max_k + max_v + spacing
+
+ border = "".join(["="] * length)
+ line = "".join(["-"] * length)
+
+ draws = ""
+ draws += border + "\n"
+
+ if header:
+ draws += h_format.format(header[0], header[1])
+ else:
+ draws += h_format.format("fleetrun Distributed Envs", "Value")
+
+ draws += line + "\n"
+
+ for k, v in envs.items():
+ if isinstance(v, str) and len(v) >= max_v:
+ str_v = "... " + v[-41:]
+ else:
+ str_v = v
+
+ draws += l_format.format(k, " " * spacing, str(str_v))
+
+ draws += border
+
+ _str = "\n{}\n".format(draws)
+ return _str
+
+
class TrainerProc(object):
def __init__(self):
self.proc = None
@@ -373,11 +413,19 @@ def start_local_trainers(cluster,
current_env.update(proc_env)
- logger.debug("trainer proc env:{}".format(current_env))
-
cmd = [sys.executable, "-u", training_script] + training_script_args
- logger.info("start trainer proc:{} env:{}".format(cmd, proc_env))
+ logger.debug("start trainer proc:{} env:{}".format(cmd, current_env))
+
+ if idx == 0:
+ logger.info("Launching {} local processes. 
First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.trainers), + pretty_print_envs(proc_env, ("Distributed Envs", + "Value")))) + logger.info( + "More details for debug about commands and environments are written in {}/run.sh". + format(log_dir)) fn = None if log_dir is not None: diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py index b1952276e44cd1466bc443440505462924115ab7..938bd258847e72e43044f2e4f5550a86e064eae5 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py @@ -42,6 +42,17 @@ class AMPOptimizer(MetaOptimizerBase): dist_strategy.amp = False dist_strategy.amp_configs = {} + def _enable_strategy(self, dist_strategy): + dist_strategy.amp = True + dist_strategy.amp_configs = { + "init_loss_scaling": 32768.0, + "incr_every_n_steps": 1000, + "decr_every_n_nan_or_inf": 2, + "incr_ratio": 2.0, + "decr_ratio": 8.0, + "use_dynamic_loss_scaling": True + } + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index f1c6defc5c982c7d56980642898aaa333c199bbe..d292f58456c3ad91d8ef2e2ddc4770b50d71cdfd 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -69,6 +69,10 @@ class DGCOptimizer(MetaOptimizerBase): dist_strategy.dgc = False dist_strategy.dgc_configs = {} + def _enable_strategy(self, dist_strategy): + dist_strategy.dgc = True + dist_strategy.dgc_configs = {"rampup_begin_step": 0, "rampup_step": 1} + def backward(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py index 7db79ad7b5b7081172209faa2396d9f2a31bbdb3..bb0c631e081971461655429c1415ec619a9f9dbc 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py @@ -45,6 +45,10 @@ class GradientMergeOptimizer(MetaOptimizerBase): dist_strategy.gradient_merge = False dist_strategy.gradient_merge_configs = {} + def _enable_strategy(self, dist_strategy): + # we currently do not support auto-enable gradient merge + return + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py index ace31687338f918ef260b3134b0bd429795542d0..03304f1b68b85f4bdf0452d0ebe88e2f46e2c94e 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py @@ -148,9 +148,6 @@ class GraphExecutionOptimizer(MetaOptimizerBase): sync_allreduce = dist_strategy.sync_nccl_allreduce if sync_allreduce: - paddle.fluid.framework.set_flags({ - "FLAGS_sync_nccl_allreduce": True - }) exe_strategy.num_threads = local_build_strategy.nccl_comm_num + 1 if local_build_strategy.use_hierarchical_allreduce: exe_strategy.num_threads = 2 * local_build_strategy.nccl_comm_num + 1 @@ -191,7 +188,11 @@ class GraphExecutionOptimizer(MetaOptimizerBase): def _disable_strategy(self, dist_strategy): # 
TODO(guru4elephant): should close all PE related flags here - pass + return + + def _enable_strategy(self, dist_strategy): + # by default, graph execution strategy is enabled + return def minimize(self, loss, diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py index 9fa29c4078e9f579a740ef8c0591979e7fbb962d..3a9f2be533b8bc176b2361eaffbc74d4b834749c 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py @@ -75,6 +75,13 @@ class LambOptimizer(MetaOptimizerBase): dist_strategy.lamb = False dist_strategy.lamb_configs = {} + def _enable_strategy(self, dist_strategy): + dist_strategy.lamb = True + dist_strategy.lamb_configs = { + "lamb_weight_decay": 0.01, + "exclude_from_weight_decay": [] + } + def backward(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py index a7b856ff5b0dcb1ab30de82a12c91a2e1c14fe76..cb12154ddc564687539d953c21b9e0597a8bf893 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py @@ -59,6 +59,13 @@ class LarsOptimizer(MetaOptimizerBase): dist_strategy.lars = False dist_strategy.lars_configs = {} + def _enable_strategy(self, dist_strategy): + dist_strategy.lars = True + dist_strategy.lars_configs = { + "lars_coeff": 0.01, + "lars_weight_decay": 0.0005, + } + def backward(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index e22127c13999bfde7aa753ad1a66536913ab04f9..3c1318301bb37bea71b896c220eb4a2090b334bf 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -14,8 +14,8 @@ from __future__ import print_function +import paddle from paddle.fluid import program_guard, layers, default_main_program -from paddle.fluid.optimizer import Momentum, SGD from .meta_optimizer_base import MetaOptimizerBase from .common import OpRole, OP_ROLE_KEY, CollectiveHelper, is_update_op @@ -35,13 +35,19 @@ class LocalSGDOptimizer(MetaOptimizerBase): if self.role_maker.worker_num() <= 1: return False - return isinstance(self.inner_opt, Momentum) \ - or isinstance(self.inner_opt, SGD) + return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \ + or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \ + or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \ + or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD) def _disable_strategy(self, dist_strategy): dist_strategy.localsgd = False dist_strategy.localsgd_configs = {} + def _enable_strategy(self, dist_strategy): + dist_strategy.localsgd = True + dist_strategy.localsgd_configs = {"k_steps": 1} + def snapshot_name(self, param_name): return param_name + self.snapshot_key diff --git a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py index 073148e11a0a2b08253b89d36d7a014b830518f8..b105c25b3ad65c1c3a3fdac5b69af3c9e728c251 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py +++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py @@ 
-48,6 +48,10 @@ class MetaOptimizerBase(Optimizer): raise NotImplementedError("you should implement disable strategy in {}". format(type(self).__name__)) + def _enable_strategy(self, dist_strategy): + raise NotImplementedError("you should implement enable strategy in {}". + format(type(self).__name__)) + def apply_gradients(self, params_grads): return self.inner_opt.apply_gradients(params_grads=params_grads) diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py index 878ed7422d733d3e2828e0395ec63ed16b4c489a..c9260dd2f8c9d0d073b9453bc575cc3e2a8aa437 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py @@ -39,6 +39,11 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer): def _disable_strategy(self, dist_strategy): dist_strategy.a_sync_configs = {} + def _enable_strategy(self, dist_strategy): + # only open up the async mode for auto-parallel + dist_strategy.a_sync = True + dist_strategy.a_sync_configs = {} + def _is_graph_out(self): return True diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py index ecb198bedf9041aa3ffc929a72cce3c209f03b61..f394a792e3a5750b13da65f50d84e3a0f516f617 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py @@ -157,4 +157,9 @@ class ParameterServerOptimizer(MetaOptimizerBase): return None, None def _disable_strategy(self, dist_strategy): + dist_strategy.a_sync_configs = {} self.user_defined_strategy.a_sync_configs = {} + + def _enable_strategy(self, dist_strategy): + dist_strategy.a_sync = True + dist_strategy.a_sync_configs = {} diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index d5a45e2b4e1aeda2e1c66c0a5a36236622f093ec..32c54d44867cc1b081d97c8f86d88b6613b30c2f 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -111,6 +111,10 @@ class PipelineOptimizer(MetaOptimizerBase): dist_strategy.pipeline = False dist_strategy.pipeline_configs = {} + def _enable_strategy(self, dist_strategy): + # we do not support enable pipeline automatically right now + return + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py index 3eb3ca6127cfe0d0a7a458c6c44e09ce22e7b24a..267656824c9acea2b85341ae284d8634c922a095 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py @@ -49,6 +49,10 @@ class RecomputeOptimizer(MetaOptimizerBase): dist_strategy.recompute = False dist_strategy.recompute_configs = {} + def _enable_strategy(self, dist_strategy): + # we do not support automatically recompute checkpoints currently + return + def backward(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py 
b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 353acf5386f0d5a63f1c0355ed9b571d9b0ec15b..5d882e0c122d62296cdbee4bc6dda2093e183d67 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -202,6 +202,9 @@ class ParameterServerRuntime(RuntimeBase): if self.role_maker._get_heter_worker_device() == "GPU": gpu_id = int(os.getenv("FLAGS_selected_gpus", "0")) executor = Executor(fluid.CUDAPlace(gpu_id)) + elif self.role_maker._get_heter_worker_device() == "XPU": + xpu_id = int(os.getenv("FLAGS_selected_xpus", "0")) + executor = Executor(fluid.XPUPlace(xpu_id)) else: raise ValueError("Not Support Device {}".format( self.role_maker._get_heter_worker_device())) diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index 49e98805d24f3f8f5dc1cfcbf3ddc8d9fb835fde..35204affb3fd168b8bd137d78c3413a08885e2bb 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -102,21 +102,24 @@ class Distribution(object): tmp = 0. for arg in args: - valid_arg = False - for cls in [float, list, np.ndarray, tensor.Variable]: - if isinstance(arg, cls): - valid_arg = True - break - assert valid_arg, "type of input args must be float, list, numpy.ndarray or Tensor." if isinstance(arg, float): - arg = np.zeros(1) + arg + arg = [arg] + if not isinstance(arg, (list, np.ndarray, tensor.Variable)): + raise TypeError( + "Type of input args must be float, list, numpy.ndarray or Tensor, but received type {}". + format(type(arg))) + arg_np = np.array(arg) arg_dtype = arg_np.dtype - if str(arg_dtype) not in ['float32']: - warnings.warn( - "data type of argument only support float32, your argument will be convert to float32." - ) + if str(arg_dtype) != 'float32': + if str(arg_dtype) != 'float64': + # "assign" op doesn't support float64. if dtype is float64, float32 variable will be generated + # and converted to float64 later using "cast". + warnings.warn( + "data type of argument only support float32 and float64, your argument will be convert to float32." + ) arg_np = arg_np.astype('float32') + # tmp is used to support broadcast, it summarizes shapes of all the args and get the mixed shape. tmp = tmp + arg_np numpy_args.append(arg_np) @@ -129,6 +132,37 @@ class Distribution(object): return tuple(variable_args) + def _check_values_dtype_in_probs(self, param, value): + """ + Log_prob and probs methods have input ``value``, if value's dtype is different from param, + convert value's dtype to be consistent with param's dtype. + + Args: + param (Tensor): low and high in Uniform class, loc and scale in Normal class. + value (Tensor): The input tensor. + + Returns: + value (Tensor): Change value's dtype if value's dtype is different from param. + """ + if in_dygraph_mode(): + if value.dtype != param.dtype and convert_dtype( + value.dtype) in ['float32', 'float64']: + warnings.warn( + "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted." + ) + return core.ops.cast(value, 'in_dtype', value.dtype, + 'out_dtype', param.dtype) + return value + + check_variable_and_dtype(value, 'value', ['float32', 'float64'], + 'log_prob') + if value.dtype != param.dtype: + warnings.warn( + "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted." 
+ )
+ return tensor.cast(value, dtype=param.dtype)
+ return value
+
class Uniform(Distribution):
"""Uniform distribution with `low` and `high` parameters.
@@ -155,8 +189,8 @@
[broadcasting](https://www.paddlepaddle.org.cn/documentation/docs/en/develop/beginners_guide/basic_concept/broadcasting_en.html) (e.g., `high - low` is a valid operation).
Args:
- low(int|float|list|numpy.ndarray|Tensor): The lower boundary of uniform distribution.The data type is int, float32, list, numpy.ndarray or Tensor
- high(int|float|list|numpy.ndarray|Tensor): The higher boundary of uniform distribution.The data type is int, float32, list, numpy.ndarray or Tensor
+ low(int|float|list|numpy.ndarray|Tensor): The lower boundary of the uniform distribution. The data type is int, float, list, numpy.ndarray or Tensor
+ high(int|float|list|numpy.ndarray|Tensor): The upper boundary of the uniform distribution. The data type is int, float, list, numpy.ndarray or Tensor
name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Examples:
@@ -206,6 +240,7 @@ class Uniform(Distribution):
self.all_arg_is_float = False
self.batch_size_unknown = False
self.name = name if name is not None else 'Uniform'
+ self.dtype = 'float32'
if isinstance(low, int):
low = float(low)
@@ -216,10 +251,22 @@ class Uniform(Distribution):
self.batch_size_unknown = True
self.low = low
self.high = high
+ self.dtype = convert_dtype(low.dtype)
else:
if isinstance(low, float) and isinstance(high, float):
self.all_arg_is_float = True
+ if isinstance(
+ low,
+ np.ndarray) and str(low.dtype) in ['float32', 'float64']:
+ self.dtype = low.dtype
+ elif isinstance(
+ high,
+ np.ndarray) and str(high.dtype) in ['float32', 'float64']:
+ self.dtype = high.dtype
self.low, self.high = self._to_tensor(low, high)
+ if self.dtype != convert_dtype(self.low.dtype):
+ self.low = tensor.cast(self.low, dtype=self.dtype)
+ self.high = tensor.cast(self.high, dtype=self.dtype)
def sample(self, shape, seed=0):
"""Generate samples of the specified shape.
@@ -241,11 +288,11 @@
if self.batch_size_unknown:
output_shape = shape + batch_shape
zero_tmp = tensor.fill_constant_batch_size_like(
- self.low + self.high, batch_shape + shape, self.low.dtype, 0.)
+ self.low + self.high, batch_shape + shape, self.dtype, 0.)
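# [Editor's note] A short usage sketch for the dtype plumbing added above
# (illustrative, not part of this patch; assumes the 2.0-beta dygraph API):
# float64 ndarray parameters should now be kept as float64 through sampling
# and probability evaluation.
import numpy as np
import paddle

paddle.disable_static()
u = paddle.distribution.Uniform(
    np.array([0.], dtype='float64'), np.array([1.], dtype='float64'))
samples = u.sample([3])      # expected dtype: float64
log_p = u.log_prob(samples)  # `value` is cast to the parameter dtype if needed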
uniform_random_tmp = nn.uniform_random_batch_size_like( zero_tmp, zero_tmp.shape, - dtype=convert_dtype(zero_tmp.dtype), + dtype=self.dtype, min=0., max=1., seed=seed) @@ -259,9 +306,8 @@ class Uniform(Distribution): else: output_shape = shape + batch_shape output = nn.uniform_random( - output_shape, seed=seed) * (tensor.zeros( - output_shape, dtype=self.low.dtype) + - (self.high - self.low)) + output_shape, seed=seed, dtype=self.dtype) * (tensor.zeros( + output_shape, dtype=self.dtype) + (self.high - self.low)) output = elementwise_add(output, self.low, name=name) if self.all_arg_is_float: return nn.reshape(output, shape, name=name) @@ -279,22 +325,20 @@ class Uniform(Distribution): """ name = self.name + '_log_prob' + value = self._check_values_dtype_in_probs(self.low, value) if in_dygraph_mode(): + # ensure value in [low, high] lb_bool = self.low < value ub_bool = value < self.high - dtype = value.dtype lb = core.ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', - dtype) + value.dtype) ub = core.ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', - dtype) + value.dtype) return nn.log(lb * ub) - nn.log(self.high - self.low) - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') - - lb_bool = control_flow.less_than(self.low, value) - ub_bool = control_flow.less_than(value, self.high) + lb_bool = self.low < value + ub_bool = value < self.high lb = tensor.cast(lb_bool, dtype=value.dtype) ub = tensor.cast(ub_bool, dtype=value.dtype) return elementwise_sub( @@ -311,22 +355,19 @@ class Uniform(Distribution): """ name = self.name + '_probs' + value = self._check_values_dtype_in_probs(self.low, value) if in_dygraph_mode(): lb_bool = self.low < value ub_bool = value < self.high - dtype = value.dtype lb = core.ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', - dtype) + value.dtype) ub = core.ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', - dtype) + value.dtype) return (lb * ub) / (self.high - self.low) - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') - - lb_bool = control_flow.less_than(self.low, value) - ub_bool = control_flow.less_than(value, self.high) + lb_bool = self.low < value + ub_bool = value < self.high lb = tensor.cast(lb_bool, dtype=value.dtype) ub = tensor.cast(ub_bool, dtype=value.dtype) return elementwise_div((lb * ub), (self.high - self.low), name=name) @@ -334,6 +375,12 @@ class Uniform(Distribution): def entropy(self): """Shannon entropy in nats. + The entropy is + + .. math:: + + entropy(low, high) = \\log (high - low) + Returns: Tensor: Shannon entropy of uniform distribution.The data type is float32. @@ -364,8 +411,8 @@ class Normal(Distribution): * :math:`Z`: is the normalization constant. Args: - loc(int|float|list|numpy.ndarray|Tensor): The mean of normal distribution.The data type is int, float32, list, numpy.ndarray or Tensor. - scale(int|float|list|numpy.ndarray|Tensor): The std of normal distribution.The data type is int, float32, list, numpy.ndarray or Tensor. + loc(int|float|list|numpy.ndarray|Tensor): The mean of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor. + scale(int|float|list|numpy.ndarray|Tensor): The std of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor. name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
Examples: @@ -418,6 +465,7 @@ class Normal(Distribution): self.batch_size_unknown = False self.all_arg_is_float = False self.name = name if name is not None else 'Normal' + self.dtype = 'float32' if isinstance(loc, int): loc = float(loc) @@ -428,10 +476,22 @@ class Normal(Distribution): self.batch_size_unknown = True self.loc = loc self.scale = scale + self.dtype = convert_dtype(loc.dtype) else: if isinstance(loc, float) and isinstance(scale, float): self.all_arg_is_float = True + if isinstance( + loc, + np.ndarray) and str(loc.dtype) in ['float32', 'float64']: + self.dtype = loc.dtype + elif isinstance( + scale, + np.ndarray) and str(scale.dtype) in ['float32', 'float64']: + self.dtype = scale.dtype self.loc, self.scale = self._to_tensor(loc, scale) + if self.dtype != convert_dtype(self.loc.dtype): + self.loc = tensor.cast(self.loc, dtype=self.dtype) + self.scale = tensor.cast(self.scale, dtype=self.dtype) def sample(self, shape, seed=0): """Generate samples of the specified shape. @@ -454,22 +514,18 @@ class Normal(Distribution): if self.batch_size_unknown: output_shape = shape + batch_shape zero_tmp = tensor.fill_constant_batch_size_like( - self.loc + self.scale, batch_shape + shape, self.loc.dtype, 0.) + self.loc + self.scale, batch_shape + shape, self.dtype, 0.) zero_tmp_reshape = nn.reshape(zero_tmp, output_shape) zero_tmp_shape = nn.shape(zero_tmp_reshape) normal_random_tmp = nn.gaussian_random( - zero_tmp_shape, - mean=0., - std=1., - seed=seed, - dtype=convert_dtype(self.loc.dtype)) + zero_tmp_shape, mean=0., std=1., seed=seed, dtype=self.dtype) output = normal_random_tmp * (zero_tmp_reshape + self.scale) output = elementwise_add(output, self.loc, name=name) return output else: output_shape = shape + batch_shape - output = nn.gaussian_random(output_shape, mean=0., std=1., seed=seed) * \ - (tensor.zeros(output_shape, dtype=self.loc.dtype) + self.scale) + output = nn.gaussian_random(output_shape, mean=0., std=1., seed=seed, dtype=self.dtype) * \ + (tensor.zeros(output_shape, dtype=self.dtype) + self.scale) output = elementwise_add(output, self.loc, name=name) if self.all_arg_is_float: return nn.reshape(output, shape, name=name) @@ -479,6 +535,16 @@ class Normal(Distribution): def entropy(self): """Shannon entropy in nats. + The entropy is + + .. math:: + + entropy(\sigma) = 0.5 \\log (2 \pi e \sigma^2) + + In the above equation: + + * :math:`scale = \sigma`: is the std. + Returns: Tensor: Shannon entropy of normal distribution.The data type is float32. @@ -486,7 +552,7 @@ class Normal(Distribution): name = self.name + '_entropy' batch_shape = list((self.loc + self.scale).shape) zero_tmp = tensor.fill_constant_batch_size_like( - self.loc + self.scale, batch_shape, self.loc.dtype, 0.) + self.loc + self.scale, batch_shape, self.dtype, 0.) return elementwise_add( 0.5 + zero_tmp, 0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)), @@ -502,11 +568,9 @@ class Normal(Distribution): Tensor: log probability.The data type is same with value. """ - if not in_dygraph_mode(): - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') - name = self.name + '_log_prob' + value = self._check_values_dtype_in_probs(self.loc, value) + var = self.scale * self.scale log_scale = nn.log(self.scale) return elementwise_sub( @@ -524,11 +588,9 @@ class Normal(Distribution): Tensor: probability.The data type is same with value. 
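+
+        The probability density function (pdf) is
+
+        .. math::
+
+            pdf(value; \mu, \sigma) = \\frac{1}{Z} e^{- \\frac{(value - \mu)^2}{2 \sigma^2}}
+
+        .. math::
+
+            Z = \sigma \sqrt{2 \pi}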
""" - if not in_dygraph_mode(): - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') - name = self.name + '_probs' + value = self._check_values_dtype_in_probs(self.loc, value) + var = self.scale * self.scale return elementwise_div( ops.exp(-1. * ((value - self.loc) * (value - self.loc)) / @@ -538,6 +600,29 @@ class Normal(Distribution): def kl_divergence(self, other): """The KL-divergence between two normal distributions. + The probability density function (pdf) is + + .. math:: + + KL\_divergence(\mu_0, \sigma_0; \mu_1, \sigma_1) = 0.5 (ratio^2 + (\\frac{diff}{\sigma_1})^2 - 1 - 2 \\ln {ratio}) + + .. math:: + + ratio = \\frac{\sigma_0}{\sigma_1} + + .. math:: + + diff = \mu_1 - \mu_0 + + In the above equation: + + * :math:`loc = \mu_0`: is the mean of current Normal distribution. + * :math:`scale = \sigma_0`: is the std of current Normal distribution. + * :math:`loc = \mu_1`: is the mean of other Normal distribution. + * :math:`scale = \sigma_1`: is the std of other Normal distribution. + * :math:`ratio`: is the ratio of scales. + * :math:`diff`: is the difference between means. + Args: other (Normal): instance of Normal. diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 93013ef8bf8442311621202e0a86dd65e7c38b30..328dafe6219adb3c6355de0bafc430c52725024f 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -74,7 +74,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): continue for in_var_name in op.input(in_name): in_var = block.var(in_var_name) - if in_var.type not in valid_types: + if in_var.type not in valid_types or in_var.dtype == dest_dtype: continue if in_var.dtype == src_dtype: cast_name = in_var.name + '.cast_' + _dtype_to_str(dest_dtype) @@ -84,7 +84,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): name=cast_name, dtype=dest_dtype, persistable=False, - stop_gradient=False) + stop_gradient=in_var.stop_gradient) block._insert_op( idx, @@ -100,7 +100,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): else: if op.has_attr('in_dtype'): op._set_attr('in_dtype', dest_dtype) - if src_dtype == core.VarDesc.VarType.FP32: + if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.FP16: for out_name in op.output_names: if op.type == 'batch_norm' and out_name != 'Y': continue diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 75e1ea43d15e432d2f6cbec271acd67624de1e01..dadc756c43ecc782a72c1c7d6626e00bc182f2c6 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -299,11 +299,14 @@ class Quant2Int8MkldnnPass(object): # Convert int8 range weights to fp32 range weights scales = self._weight_scales[output_var_name] weight = self._load_param(self._scope, weight_var_name) - assert scales.size == 1 or scales.size == len( - weight - ), "The size of weight scales vector ({}) does not match the number of output channels ({}) in the weights tensor {}.".format( - scales.size, len(weight), weight_var_name) - w_fp32 = np.divide(np.multiply(weight, self._s8_max).T, scales.T).T + if scales.size == 1 or scales.size == weight.shape[0]: + w_fp32 = np.divide(np.multiply(weight, self._s8_max).T, scales.T).T + elif 
len(weight.shape) > 1 and scales.size == weight.shape[1]: + w_fp32 = np.divide(np.multiply(weight, self._s8_max), scales) + else: + raise ValueError( + "The size of weight scales vector ({}) does not match the dimensions ({}) of the weights tensor {}." + .format(scales.size, weight.shape, weight_var_name)) w_fp32 = w_fp32.reshape(weight.shape).astype(np.float32) self._restore_var(weight_var_name, w_fp32) diff --git a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py index fcbb1b66ad1fd73a152b9128fa75a152baecd223..7b51973131496172d61b7ad968417eb41fa11c08 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py @@ -43,7 +43,7 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): self.conv_output = np.ndarray(self.conv_output_size).astype(self.dtype) self.conv_output2 = np.ndarray(self.conv_output2_size).astype( self.dtype) - self.quantized_ops = 'conv2d' + self.quantized_ops = 'conv2d,mul' self.variables = { "input": self.input, "filter": self.filter, @@ -51,6 +51,22 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): "conv_output": self.conv_output, "conv_output2": self.conv_output2, } + self.mul_input_size = [1, 3] + self.mul_weights_size = [3, 5] + self.mul_output_size = [1, 5] + self.mul_input = np.random.random(self.mul_input_size).astype( + self.dtype) + self.mul_weights = np.ones(self.mul_weights_size, self.dtype) + self.mul_weights_bad = np.ones([1, 1], self.dtype) + self.mul_output = np.ndarray(self.mul_output_size).astype(self.dtype) + self.mul_output_scale = np.linspace(1, 5, num=5).astype(self.dtype) + + self.variables_mul = { + "mul_input": self.mul_input, + "mul_weights": self.mul_weights, + "mul_output": self.mul_output, + "mul_weights_bad": self.mul_weights_bad + } def prepare_program(self, program): block = program.global_block() @@ -92,6 +108,23 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): 'fuse_brelu': True }) + def prepare_program_mul(self, program): + block = program.global_block() + for name in self.variables_mul: + block.create_var( + name=name, + dtype="float32", + shape=self.variables_mul[name].shape) + + mul_op1 = block.append_op( + type="mul", + inputs={ + "X": block.var('mul_input'), + "Y": block.var('mul_weights') + }, + outputs={"Out": block.var('mul_output')}, + attrs={'use_mkldnn': self.use_mkldnn}) + def remove_fuse_activation_attribute(self, graph): for op in graph.all_op_nodes(): op.op().remove_attr("fuse_activation") @@ -103,11 +136,13 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): def check_graph_after_pass(self, graph): for op in graph.all_op_nodes(): - self.assertTrue(op.op().has_attr("fuse_activation")) - if op.op().has_attr("fuse_relu") and op.op().attr("fuse_relu"): - self.assertTrue(op.op().attr("fuse_activation") == "relu") - if op.op().has_attr("fuse_brelu") and op.op().attr("fuse_brelu"): - self.assertTrue(op.op().attr("fuse_activation") == "relu6") + if op.op().type() == "conv2d": + self.assertTrue(op.op().has_attr("fuse_activation")) + if op.op().has_attr("fuse_relu") and op.op().attr("fuse_relu"): + self.assertTrue(op.op().attr("fuse_activation") == "relu") + if op.op().has_attr("fuse_brelu") and op.op().attr( + "fuse_brelu"): + self.assertTrue(op.op().attr("fuse_activation") == "relu6") def test_quant_update_activation(self): program = fluid.Program() @@ -125,6 +160,39 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): graph = 
quant2_int8_mkldnn_pass._update_activations(graph) self.check_graph_after_pass(graph) + def test_dequantize_op_weights(self): + program = fluid.Program() + with fluid.program_guard(program): + self.prepare_program_mul(program) + graph = IrGraph(core.Graph(program.desc), for_test=True) + + for op in graph.all_op_nodes(): + if op.op().type() == "mul": + op_node = op + break + + qpass = Quant2Int8MkldnnPass( + self.quantized_ops, + _scope=self.scope, + _place=self.place, + _core=core, + _debug=False) + qpass._weight_scales["mul_output"] = self.mul_output_scale + param = self.scope.var("mul_weights").get_tensor() + param.set(self.variables_mul["mul_weights"], self.place) + qpass._dequantize_op_weights(graph, op_node, "Y", "Out") + + assert np.allclose( + self.scope.find_var("mul_weights").get_tensor(), + [[127, 63.5, 42.3333, 31.75, 25.4], + [127, 63.5, 42.3333, 31.75, 25.4], + [127, 63.5, 42.3333, 31.75, 25.4]]) + + param = self.scope.var("mul_weights").get_tensor() + param.set(self.variables_mul["mul_weights_bad"], self.place) + with self.assertRaises(ValueError): + qpass._dequantize_op_weights(graph, op_node, "Y", "Out") + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 45aa85d4168a55e206460ce2e39292013caa9ce0..5da83da33b8de334d4ae1e5b072cfb20d74c1271 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -132,6 +132,28 @@ def check_dtype(input_dtype, extra_message)) +def check_shape(shape, + op_name, + expected_shape_type=(list, tuple, Variable), + expected_element_type=(int, Variable), + expected_tensor_dtype=('int32', 'int64')): + # See NOTE [ Why skip dynamic graph check ] + if in_dygraph_mode(): + return + check_type(shape, 'shape', expected_shape_type, op_name) + if expected_element_type is not None and not isinstance(shape, Variable): + for item in shape: + check_type(item, 'element of shape', expected_element_type, op_name) + if expected_tensor_dtype is not None and isinstance(item, Variable): + check_dtype( + item.dtype, 'element of shape', expected_tensor_dtype, + op_name, + 'If element of shape is Tensor, its data type should be {}'. + format(', '.join(expected_tensor_dtype))) + if expected_tensor_dtype is not None and isinstance(shape, Variable): + check_dtype(shape.dtype, 'shape', expected_tensor_dtype, op_name) + + class DataToLoDTensorConverter(object): def __init__(self, place, lod_level, shape, dtype): self.place = place diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index f4d68a798efa26d43702aa1c555f6046f0e6a6a5..30ded1f7eda295bab5567a082ba1fa3989b55fa2 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -16,13 +16,16 @@ from __future__ import print_function import os import collections +import functools from ..framework import Variable, default_main_program, in_dygraph_mode, dygraph_only, Parameter, ParamBase, _varbase_creator, _dygraph_tracer import pickle import six from . import learning_rate_scheduler import warnings from .. 
import core
-from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME, _load_persistable_vars
+from .base import guard
+from paddle.fluid.dygraph.jit import SaveLoadConfig, deprecate_save_load_configs
+from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers
__all__ = [
'save_dygraph',
@@ -30,6 +33,37 @@
]
+
+# NOTE(chenweihang): deprecate load_dygraph's argument keep_name_table,
+# to stay compatible when users still pass the keep_name_table argument
+def deprecate_keep_name_table(func):
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ def __warn_and_build_configs__(keep_name_table):
+ warnings.warn(
+ "The argument `keep_name_table` has been deprecated, please use `SaveLoadConfig.keep_name_table` instead.",
+ DeprecationWarning)
+ config = SaveLoadConfig()
+ config.keep_name_table = keep_name_table
+ return config
+
+ # deal with arg `keep_name_table`
+ if len(args) > 1 and isinstance(args[1], bool):
+ args = list(args)
+ args[1] = __warn_and_build_configs__(args[1])
+ # deal with kwargs
+ elif 'keep_name_table' in kwargs:
+ kwargs['config'] = __warn_and_build_configs__(kwargs[
+ 'keep_name_table'])
+ kwargs.pop('keep_name_table')
+ else:
+ # do nothing
+ pass
+
+ return func(*args, **kwargs)
+
+ return wrapper
+
+
@dygraph_only
def save_dygraph(state_dict, model_path):
'''
@@ -100,17 +134,28 @@ def save_dygraph(state_dict, model_path):
# TODO(qingqing01): remove dygraph_only to support loading static model.
# maybe need to unify the loading interface after 2.0 API is ready.
-#@dygraph_only
-def load_dygraph(model_path, keep_name_table=False):
+# @dygraph_only
+@deprecate_save_load_configs
+@deprecate_keep_name_table
+def load_dygraph(model_path, config=None):
'''
:api_attr: imperative
- Load parameter state_dict from disk.
+ Load parameter state dict from disk.
+
+ .. note::
+ For historical reasons, if you load ``state_dict`` from the saved
+ result of `paddle.io.save_inference_model`, the structured variable names
+ cannot be restored. You need to set the argument `use_structured_name=False`
+ when using `Layer.set_state_dict` later.
Args:
- model_path(str) : The file prefix store the state_dict. (The path should Not contain suffix '.pdparams')
- keep_name_table(bool, optional) : Whether keep structed name to parameter name conversion table in output dict.
- Default : False
+ model_path(str) : The file prefix of the stored state_dict.
+ (The path should not contain the suffix '.pdparams')
+ config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig`
+ object that specifies additional configuration options; these options
+ are for compatibility with ``jit.save/io.save_inference_model`` formats.
+ Default None.
Returns:
state_dict(dict) : the dict store the state_dict
Examples:
..
code-block:: python - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + paddle.disable_static() - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") + emb = paddle.nn.Embedding([10, 10]) - adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000), - parameter_list = emb.parameters() ) - state_dict = adam.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") + state_dict = emb.state_dict() + paddle.save(state_dict, "paddle_dy") - para_state_dict, opti_state_dict = fluid.load_dygraph( "paddle_dy") + scheduler = paddle.optimizer.lr_scheduler.NoamLR( + d_model=0.01, warmup_steps=100, verbose=True) + adam = paddle.optimizer.Adam( + learning_rate=scheduler, + parameters=emb.parameters()) + state_dict = adam.state_dict() + paddle.save(state_dict, "paddle_dy") - ''' + para_state_dict, opti_state_dict = paddle.load("paddle_dy") + ''' + # deal with argument `model_path` model_prefix = model_path if model_prefix.endswith(".pdparams"): model_prefix = model_prefix[:-9] @@ -145,66 +194,45 @@ def load_dygraph(model_path, keep_name_table=False): opti_dict = None params_file_path = model_prefix + ".pdparams" opti_file_path = model_prefix + ".pdopt" + + # deal with argument `configs` + configs = config + if configs is None: + configs = SaveLoadConfig() + if not os.path.exists(params_file_path) and not os.path.exists( opti_file_path): - # Load state dict by `jit.save` save format - # TODO(chenweihang): [Why not support `io.save_infernece_model` save format here] + # Load state dict by `jit.save/io.save_inference_model` save format + # NOTE(chenweihang): [ Compatibility of save_inference_model save format ] # The model saved by `save_inference_model` does not completely correspond to # the information required by the `state_dict` under the dygraph. - # Although we reluctantly restore the `state_dict` in some scenarios, - # this may not be complete and there are some limitations, so this function - # will be considered later. The limitations include: - # 1. `save_inference_model` not save structured name, we need to remind - # the user to configure the `use_structured_name` argument when `set_dict`, - # but this argument is currently not public - # 2. if `save_inference_model` save all persistable variables in a single file, - # user need to give the variable name list to load `state_dict` + # `save_inference_model` not save structured name, we need to remind + # the user to configure the `use_structured_name` argument when `set_state_dict` + # NOTE(chenweihang): `jit.save` doesn't save optimizer state # 1. check model path if not os.path.isdir(model_prefix): raise ValueError("Model saved directory '%s' is not exists." % model_prefix) - # 2. load `__variables.info__` - var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME) - if not os.path.exists(var_info_path): - raise RuntimeError( - "No target can be loaded. Now only supports loading `state_dict` from " - "the result saved by `imperative.save` and `imperative.jit.save`." - ) - with open(var_info_path, 'rb') as f: - extra_var_info = pickle.load(f) - # 3. 
load `__variables__` - # TODO(chenweihang): now only supports loading from default save format: - # - all persistable vars saved in one file named `__variables__` - # for other case, we may need to modify the arguments of this API - var_file_path = os.path.join(model_prefix, VARIABLE_FILENAME) - if not os.path.exists(var_file_path): - raise RuntimeError( - "The parameter file to be loaded was not found. " - "Now only supports loading from the default save format, " - "and does not support custom params_filename and " - "save parameters separately.") - # 4. load all persistable vars - load_var_list = [] - for name in sorted(extra_var_info): - var = _varbase_creator(name=name, persistable=True) - load_var_list.append(var) - _dygraph_tracer().trace_op( - type='load_combine', - inputs={}, - outputs={'Out': load_var_list}, - attrs={'file_path': var_file_path}) - # 5. construct state_dict - para_dict = dict() - for var in load_var_list: - structured_name = extra_var_info[var.name].get('structured_name', - None) - if structured_name is None: - raise RuntimeError( - "Cannot find saved variable (%s)'s structured name in saved model.", - var.name) - para_dict[structured_name] = var.numpy() - # NOTE: `jit.save` doesn't save optimizer state + + # 2. load program desc & construct _ProgramHolder + programs = _construct_program_holders(model_path, + configs.model_filename) + + # 3. load layer parameters & buffers + # NOTE: using fluid.dygraph.guard() here will cause import error in py2 + with guard(): + persistable_var_dict = _construct_params_and_buffers( + model_prefix, + programs, + configs.separate_params, + configs.params_filename, + append_suffix=False) + + # 4. construct state_dict + para_dict = dict() + for var_name in persistable_var_dict: + para_dict[var_name] = persistable_var_dict[var_name].numpy() else: # Load state dict by `save_dygraph` save format para_dict = {} @@ -213,7 +241,7 @@ def load_dygraph(model_path, keep_name_table=False): para_dict = pickle.load(f) if six.PY2 else pickle.load( f, encoding='latin1') - if not keep_name_table and "StructuredToParameterName@@" in para_dict: + if not configs.keep_name_table and "StructuredToParameterName@@" in para_dict: del para_dict["StructuredToParameterName@@"] if os.path.exists(opti_file_path): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index 4630cfcdabfd307ea03a7fd0c885c73ce4a4ea0b..c837c8eb123c2707d89a75a7489607f43a2e7501 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -31,6 +31,7 @@ from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_len from paddle.fluid.dygraph.dygraph_to_static.logging_utils import TranslatorLogger from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticLayer from paddle.fluid.dygraph.dygraph_to_static.program_translator import convert_to_static +from paddle.fluid.dygraph.dygraph_to_static.program_translator import unwrap_decorators from paddle.fluid.dygraph.layers import Layer # TODO(liym27): A better way to do this. @@ -118,14 +119,9 @@ def convert_call(func): func_self = None converted_call = None - # Function in convert_call may be decorated by another `@declarative`, + # Function in convert_call may be decorated by another `@to_static`, # in this case, unwraps it into a raw method or function. 
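# [Editor's note] A minimal sketch of what `unwrap_decorators` is assumed to do
# here (simplified; the real helper lives in program_translator, and StaticLayer
# is already imported in this module): peel nested StaticLayer wrappers until a
# plain function remains, rebinding it to the owning instance when there is one.
def _unwrap_decorators_sketch(func):
    decorators = []
    while isinstance(func, StaticLayer):
        decorators.append(func)
        instance = func._class_instance
        if instance is not None:
            func = func.dygraph_function.__get__(instance)
        else:
            func = func.dygraph_function
    return decorators, func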
- if isinstance(func, StaticLayer):
- instance = func._class_instance
- if instance is not None:
- func = func.dygraph_function.__get__(instance)
- else:
- func = func.dygraph_function
+ _, func = unwrap_decorators(func)
if is_builtin_len(func):
return convert_len
@@ -155,7 +151,8 @@ def convert_call(func):
if inspect.isfunction(fn):
global_functions.add(fn)
elif isinstance(fn, StaticLayer):
- global_functions.add(fn.dygraph_function)
+ _, fn = unwrap_decorators(fn)
+ global_functions.add(fn)
if func in global_functions:
converted_call = convert_to_static(func)
@@ -189,7 +186,8 @@ def convert_call(func):
elif hasattr(func, '__class__') and hasattr(func.__class__, '__call__'):
if hasattr(func, 'forward') and isinstance(func, Layer):
try:
- forward_func = convert_to_static(func.forward)
+ _, forward_func = unwrap_decorators(func.forward)
+ forward_func = convert_to_static(forward_func)
setattr(func, 'forward', forward_func)
func_self = func
except Exception:
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
index cb489af44d0adc7da377f73a3205c3c264769b4d..3d27810f1db94c4f6c273399ec93b9335f5bb03a 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -21,6 +21,7 @@ import six
import textwrap
import threading
import warnings
+import weakref
import gast
from paddle.fluid import framework
@@ -245,6 +246,7 @@ class StaticLayer(object):
self._input_spec = input_spec
self._function_spec = FunctionSpec(function, input_spec)
self._program_cache = ProgramCache()
+ self._descriptor_cache = weakref.WeakKeyDictionary()
# Note: Hold a reference to ProgramTranslator for switching `enable_declarative`.
self._program_trans = ProgramTranslator()
@@ -271,8 +273,19 @@ class StaticLayer(object):
of `Net` instance. After decorated by `@paddle.jit.to_static`, it
will firstly to call `__get__` to parse the class instance correctly instead of the `StaticLayer` instance.
"""
- self._class_instance = instance
- return self
+ if instance not in self._descriptor_cache:
+ if instance is None:
+ return self
+ # Note(Aurelius84): Construct a new instance of StaticLayer when we
+ # first encounter the bound function of a layer, and cache it.
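# [Editor's note] The caching below follows the standard per-instance
# descriptor pattern; a self-contained generic sketch (names illustrative,
# instances must be weak-referenceable) looks like this:
import weakref

class per_instance_cache(object):
    def __init__(self, func):
        self._func = func
        self._cache = weakref.WeakKeyDictionary()

    def __get__(self, obj, objtype=None):
        if obj is None:
            return self
        if obj not in self._cache:
            # bind once per instance, then reuse the bound object
            self._cache[obj] = self._func.__get__(obj)
        return self._cache[obj]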
+ new_static_layer = self._clone() + new_static_layer._class_instance = instance + self._descriptor_cache[instance] = new_static_layer + + return self._descriptor_cache[instance] + + def _clone(self): + return self.__class__(self._dygraph_function, self._input_spec) def __call__(self, *args, **kwargs): """ diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 7f3d450a49c7d3fcc9ca1d3c2d7c5eb732671c6c..1d2ea142c7d5f2e653e446986a39d1bc155006f0 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -488,6 +488,15 @@ def _load_persistable_vars(model_path, return load_var_dict +# NOTE(chenweihang): to adapt paddle.load to get state_dict +def _remove_varname_suffix(var_dict, program_holder): + no_suffix_var_dict = dict() + for var_name in var_dict: + no_suffix_name = program_holder._suffix_varname_dict[var_name] + no_suffix_var_dict[no_suffix_name] = var_dict[var_name] + return no_suffix_var_dict + + def _construct_program_holders(model_path, model_filename=None): # make sure the path has been checked program_holder_dict = dict() @@ -517,7 +526,8 @@ def _construct_program_holders(model_path, model_filename=None): def _construct_params_and_buffers(model_path, programs, separate_params=False, - params_filename=None): + params_filename=None, + append_suffix=True): var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) if os.path.exists(var_info_path): var_dict = _load_persistable_vars(model_path, var_info_path, @@ -526,6 +536,10 @@ def _construct_params_and_buffers(model_path, else: var_dict = _load_persistable_vars_by_program( model_path, programs['forward'], params_filename) + + if not append_suffix: + var_dict = _remove_varname_suffix(var_dict, programs['forward']) + return var_dict @@ -542,89 +556,92 @@ class TranslatedLayer(layers.Layer): .. 
code-block:: python import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - BATCH_SIZE = 32 - BATCH_NUM = 20 + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 - def random_batch_reader(): - def _get_random_images_and_labels(image_shape, label_shape): - image = np.random.random(size=image_shape).astype('float32') - label = np.random.random(size=label_shape).astype('int64') - return image, label + IMAGE_SIZE = 784 + CLASS_NUM = 10 - def __reader__(): - for _ in range(BATCH_NUM): - batch_image, batch_label = _get_random_images_and_labels( - [BATCH_SIZE, 784], [BATCH_SIZE, 1]) - yield batch_image, batch_label + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples - return __reader__ + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples - class LinearNet(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): + class LinearNet(nn.Layer): + def __init__(self): super(LinearNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) - @declarative + @paddle.jit.to_static def forward(self, x): return self._linear(x) + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) + # enable dygraph mode - fluid.enable_dygraph() + place = paddle.CPUPlace() + paddle.disable_static(place) # 1. train & save model. - # create network - net = LinearNet(784, 1) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - # create data loader - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator(random_batch_reader()) - # train - for data in train_loader(): - img, label = data - label.stop_gradient = True - cost = net(img) + # create network + layer = LinearNet() + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) - avg_loss.backward() - adam.minimize(avg_loss) - net.clear_gradients() + # train + train(layer, loader, loss_fn, adam) + # save model_path = "linear.example.model" - fluid.dygraph.jit.save( - layer=net, - model_path=model_path, - input_spec=[img]) + paddle.jit.save(layer, model_path) # 2. 
load model as TranslatedLayer
- translated_layer = fluid.dygraph.jit.load(model_path)
+
+ # load
+ translated_layer = paddle.jit.load(model_path)
+
# inference
translated_layer.eval()
- x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32'))
+ x = paddle.randn([1, IMAGE_SIZE], 'float32')
pred = translated_layer(x)
+
# fine-tune
translated_layer.train()
- adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=translated_layer.parameters())
- train_loader = fluid.io.DataLoader.from_generator(capacity=5)
- train_loader.set_batch_generator(random_batch_reader())
- for data in train_loader():
- img, label = data
- label.stop_gradient = True
-
- cost = translated_layer(img)
+ adam = opt.Adam(learning_rate=0.001, parameters=translated_layer.parameters())
+ train(translated_layer, loader, loss_fn, adam)
- loss = fluid.layers.cross_entropy(cost, label)
- avg_loss = fluid.layers.mean(loss)
-
- avg_loss.backward()
- adam.minimize(avg_loss)
- translated_layer.clear_gradients()
"""
def __init__(self, programs, persistable_vars):
@@ -685,7 +702,7 @@
# 1. load program desc & construct _ProgramHolder
programs = _construct_program_holders(model_path, model_filename)
- # 2. load layer parameters & parameter attributes
+ # 2. load layer parameters & buffers
persistable_vars = _construct_params_and_buffers(
model_path, programs, separate_params, params_filename)
@@ -800,3 +817,107 @@
def eval(self):
self._is_test = True
+
+ def program(self, method_name='forward'):
+ """
+ Gets the translated program of the specified method.
+
+ Args:
+ - method_name (string): method name corresponding to the program
+ to be obtained. Default: 'forward'.
+
+ Returns:
+ Program
+
+ Examples:
+ ..
code-block:: python + + import numpy as np + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 + + IMAGE_SIZE = 784 + CLASS_NUM = 10 + + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) + + @paddle.jit.to_static + def forward(self, x): + return self._linear(x) + + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) + + # enable dygraph mode + place = paddle.CPUPlace() + paddle.disable_static(place) + + # create network + layer = LinearNet() + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) + + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + + # train + train(layer, loader, loss_fn, adam) + + # save + model_path = "linear.example.model" + paddle.jit.save(layer, model_path) + + # load + translated_layer = paddle.jit.load(model_path) + + # get program + program = translated_layer.program() + """ + # 1. get program holder + program_holder = self._program_holder_dict.get(method_name, None) + if program_holder is None: + raise ValueError( + "The method `%s` is not exists in loaded TranslatedLayer." % + method_name) + + # 2. get inference program desc + program_desc = program_holder.infer_program + + # 3. construct program + program = _build_program_by_desc(program_desc) + return program diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index f67b79b91f7da235697d920cf0dfe376e88ab93e..d520fe61888cf3b11efc61d67ce566a3407dc6ff 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -17,6 +17,7 @@ from __future__ import print_function import os import pickle import warnings +import functools import six import paddle @@ -228,63 +229,60 @@ class SaveLoadConfig(object): .. 
@@ -228,63 +229,60 @@ class SaveLoadConfig(object):
        .. code-block:: python

-            import numpy as np
-            import paddle.fluid as fluid
-            from paddle.fluid.dygraph import Linear
-            from paddle.fluid.dygraph import declarative
+            import paddle
+            import paddle.nn as nn
+            import paddle.optimizer as opt

-            class SimpleNet(fluid.dygraph.Layer):
+            class SimpleNet(nn.Layer):
                def __init__(self, in_size, out_size):
                    super(SimpleNet, self).__init__()
-                    self._linear = Linear(in_size, out_size)
+                    self._linear = nn.Linear(in_size, out_size)

-                @declarative
+                @paddle.jit.to_static
                def forward(self, x):
                    y = self._linear(x)
                    z = self._linear(y)
                    return z

            # enable dygraph mode
-            fluid.enable_dygraph()
+            paddle.disable_static()

            # train model
            net = SimpleNet(8, 8)
-            adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters())
-            x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+            adam = opt.Adam(learning_rate=0.1, parameters=net.parameters())
+            x = paddle.randn([4, 8], 'float32')
            for i in range(10):
                out = net(x)
-                loss = fluid.layers.mean(out)
+                loss = paddle.tensor.mean(out)
                loss.backward()
-                adam.minimize(loss)
-                net.clear_gradients()
+                adam.step()
+                adam.clear_grad()

            # use SaveLoadConfig when saving model
            model_path = "simplenet.example.model"
-            configs = fluid.dygraph.jit.SaveLoadConfig()
-            configs.model_filename = "__simplenet__"
-            fluid.dygraph.jit.save(
+            config = paddle.SaveLoadConfig()
+            config.model_filename = "__simplenet__"
+            paddle.jit.save(
                layer=net,
                model_path=model_path,
-                input_spec=[x],
-                configs=configs)
+                config=config)

        2. Using ``SaveLoadConfig`` when loading model

        .. code-block:: python

-            import numpy as np
-            import paddle.fluid as fluid
+            import paddle

            # enable dygraph mode
-            fluid.enable_dygraph()
+            paddle.disable_static()

            # use SaveLoadConfig when loading model
            model_path = "simplenet.example.model"
-            configs = fluid.dygraph.jit.SaveLoadConfig()
-            configs.model_filename = "__simplenet__"
-            infer_net = fluid.dygraph.jit.load(model_path, configs=configs)
+            config = paddle.SaveLoadConfig()
+            config.model_filename = "__simplenet__"
+            infer_net = paddle.jit.load(model_path, config=config)
            # inference
-            x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+            x = paddle.randn([4, 8], 'float32')
            pred = infer_net(x)
    """

@@ -293,6 +291,8 @@ class SaveLoadConfig(object):
        self._model_filename = None
        self._params_filename = None
        self._separate_params = False
+        # used for `paddle.load`
+        self._keep_name_table = False

        # NOTE: Users rarely use following configs, so these configs are not open to users,
        # reducing user learning costs, but we retain the configuration capabilities

@@ -322,51 +322,46 @@ class SaveLoadConfig(object):
        Examples:
            .. code-block:: python

-            import numpy as np
-            import paddle.fluid as fluid
-            from paddle.fluid.dygraph import Linear
-            from paddle.fluid.dygraph import declarative
+            import paddle
+            import paddle.nn as nn
+            import paddle.optimizer as opt

-            class SimpleNet(fluid.dygraph.Layer):
+            class SimpleNet(nn.Layer):
                def __init__(self, in_size, out_size):
                    super(SimpleNet, self).__init__()
-                    self._linear = Linear(in_size, out_size)
+                    self._linear = nn.Linear(in_size, out_size)

-                @declarative
+                @paddle.jit.to_static
                def forward(self, x):
                    y = self._linear(x)
                    z = self._linear(y)
-                    loss = fluid.layers.mean(z)
+                    loss = paddle.tensor.mean(z)
                    return z, loss

            # enable dygraph mode
-            fluid.enable_dygraph()
+            paddle.disable_static()

            # train model
            net = SimpleNet(8, 8)
-            adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters())
-            x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+            adam = opt.Adam(learning_rate=0.1, parameters=net.parameters())
+            x = paddle.randn([4, 8], 'float32')
            for i in range(10):
                out, loss = net(x)
                loss.backward()
-                adam.minimize(loss)
-                net.clear_gradients()
+                adam.step()
+                adam.clear_grad()

            # use SaveLoadConfig.output_spec
            model_path = "simplenet.example.model.output_spec"
-            configs = fluid.dygraph.jit.SaveLoadConfig()
-            # only keep the predicted output in saved model, discard loss
-            configs.output_spec = [out]
-
-            fluid.dygraph.jit.save(
+            config = paddle.SaveLoadConfig()
+            config.output_spec = [out]
+            paddle.jit.save(
                layer=net,
                model_path=model_path,
-                input_spec=[x],
-                configs=configs)
+                config=config)

-            infer_net = fluid.dygraph.jit.load(model_path, configs=configs)
-            x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
-            # only have the predicted output
+            infer_net = paddle.jit.load(model_path)
+            x = paddle.randn([4, 8], 'float32')
            pred = infer_net(x)
        """
        return self._output_spec

@@ -393,52 +388,47 @@ class SaveLoadConfig(object):
        Examples:
            .. code-block:: python

-            import numpy as np
-            import paddle.fluid as fluid
-            from paddle.fluid.dygraph import Linear
-            from paddle.fluid.dygraph import declarative
+            import paddle
+            import paddle.nn as nn
+            import paddle.optimizer as opt

-            class SimpleNet(fluid.dygraph.Layer):
+            class SimpleNet(nn.Layer):
                def __init__(self, in_size, out_size):
                    super(SimpleNet, self).__init__()
-                    self._linear = Linear(in_size, out_size)
+                    self._linear = nn.Linear(in_size, out_size)

-                @declarative
+                @paddle.jit.to_static
                def forward(self, x):
                    y = self._linear(x)
                    z = self._linear(y)
                    return z

            # enable dygraph mode
-            fluid.enable_dygraph()
+            paddle.disable_static()

            # train model
            net = SimpleNet(8, 8)
-            adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters())
-            x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+            adam = opt.Adam(learning_rate=0.1, parameters=net.parameters())
+            x = paddle.randn([4, 8], 'float32')
            for i in range(10):
                out = net(x)
-                loss = fluid.layers.mean(out)
+                loss = paddle.tensor.mean(out)
                loss.backward()
-                adam.minimize(loss)
-                net.clear_gradients()
-
-            model_path = "simplenet.example.model.model_filename"
-            configs = fluid.dygraph.jit.SaveLoadConfig()
-            configs.model_filename = "__simplenet__"
+                adam.step()
+                adam.clear_grad()

            # saving with config.model_filename
-            fluid.dygraph.jit.save(
+            model_path = "simplenet.example.model.model_filename"
+            config = paddle.SaveLoadConfig()
+            config.model_filename = "__simplenet__"
+            paddle.jit.save(
                layer=net,
                model_path=model_path,
-                input_spec=[x],
-                configs=configs)
-            # [result] the saved model directory contains:
-            # __simplenet__  __variables__  __variables.info__

            # loading with config.model_filename
-            infer_net = fluid.dygraph.jit.load(model_path, configs=configs)
-            x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+            infer_net = paddle.jit.load(model_path, config=config)
+            x = paddle.randn([4, 8], 'float32')
            pred = infer_net(x)
        """
        return self._model_filename
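A quick way to see the effect of these filename settings is to list the saved directory. A minimal sketch (assuming the ``simplenet.example.model.model_filename`` directory produced by the example above; the default variable-file names may vary across Paddle versions):

.. code-block:: python

    import os

    model_path = "simplenet.example.model.model_filename"
    # expect the custom program file "__simplenet__" alongside the
    # parameter files, e.g. "__variables__" and "__variables.info__"
    print(sorted(os.listdir(model_path)))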
@@ -463,52 +453,48 @@ class SaveLoadConfig(object):
        Examples:
            .. code-block:: python

-            import numpy as np
-            import paddle.fluid as fluid
-            from paddle.fluid.dygraph import Linear
-            from paddle.fluid.dygraph import declarative
+            import paddle
+            import paddle.nn as nn
+            import paddle.optimizer as opt

-            class SimpleNet(fluid.dygraph.Layer):
+            class SimpleNet(nn.Layer):
                def __init__(self, in_size, out_size):
                    super(SimpleNet, self).__init__()
-                    self._linear = Linear(in_size, out_size)
+                    self._linear = nn.Linear(in_size, out_size)

-                @declarative
+                @paddle.jit.to_static
                def forward(self, x):
                    y = self._linear(x)
                    z = self._linear(y)
                    return z

            # enable dygraph mode
-            fluid.enable_dygraph()
+            paddle.disable_static()

            # train model
            net = SimpleNet(8, 8)
-            adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters())
-            x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+            adam = opt.Adam(learning_rate=0.1, parameters=net.parameters())
+            x = paddle.randn([4, 8], 'float32')
            for i in range(10):
                out = net(x)
-                loss = fluid.layers.mean(out)
+                loss = paddle.tensor.mean(out)
                loss.backward()
-                adam.minimize(loss)
-                net.clear_gradients()
+                adam.step()
+                adam.clear_grad()

            model_path = "simplenet.example.model.params_filename"
-            configs = fluid.dygraph.jit.SaveLoadConfig()
-            configs.params_filename = "__params__"
+            config = paddle.SaveLoadConfig()
+            config.params_filename = "__params__"

            # saving with config.params_filename
-            fluid.dygraph.jit.save(
+            paddle.jit.save(
                layer=net,
                model_path=model_path,
-                input_spec=[x],
-                configs=configs)
-            # [result] the saved model directory contains:
-            # __model__  __params__  __variables.info__
+                config=config)

            # loading with config.params_filename
-            infer_net = fluid.dygraph.jit.load(model_path, configs=configs)
-            x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+            infer_net = paddle.jit.load(model_path, config=config)
+            x = paddle.randn([4, 8], 'float32')
            pred = infer_net(x)
        """
        return self._params_filename

@@ -542,52 +528,50 @@ class SaveLoadConfig(object):
        Examples:
            .. code-block:: python

-            import numpy as np
-            import paddle.fluid as fluid
-            from paddle.fluid.dygraph import Linear
-            from paddle.fluid.dygraph import declarative
+            import paddle
+            import paddle.nn as nn
+            import paddle.optimizer as opt

-            class SimpleNet(fluid.dygraph.Layer):
+            class SimpleNet(nn.Layer):
                def __init__(self, in_size, out_size):
                    super(SimpleNet, self).__init__()
-                    self._linear = Linear(in_size, out_size)
+                    self._linear = nn.Linear(in_size, out_size)

-                @declarative
+                @paddle.jit.to_static
                def forward(self, x):
                    y = self._linear(x)
                    z = self._linear(y)
                    return z

            # enable dygraph mode
-            fluid.enable_dygraph()
+            paddle.disable_static()

            # train model
            net = SimpleNet(8, 8)
-            adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters())
-            x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+            adam = opt.Adam(learning_rate=0.1, parameters=net.parameters())
+            x = paddle.randn([4, 8], 'float32')
            for i in range(10):
                out = net(x)
-                loss = fluid.layers.mean(out)
+                loss = paddle.tensor.mean(out)
                loss.backward()
-                adam.minimize(loss)
-                net.clear_gradients()
+                adam.step()
+                adam.clear_grad()

            model_path = "simplenet.example.model.separate_params"
-            configs = fluid.dygraph.jit.SaveLoadConfig()
-            configs.separate_params = True
+            config = paddle.jit.SaveLoadConfig()
+            config.separate_params = True

            # saving with config.separate_params
-            fluid.dygraph.jit.save(
+            paddle.jit.save(
                layer=net,
                model_path=model_path,
-                input_spec=[x],
-                configs=configs)
+                config=config)
            # [result] the saved model directory contains:
            # linear_0.b_0  linear_0.w_0  __model__  __variables.info__

            # loading with config.separate_params
-            infer_net = fluid.dygraph.jit.load(model_path, configs=configs)
-            x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+            infer_net = paddle.jit.load(model_path, config=config)
+            x = paddle.randn([4, 8], 'float32')
            pred = infer_net(x)
        """
        return self._separate_params

@@ -600,9 +584,70 @@ class SaveLoadConfig(object):
                % type(value))
        self._separate_params = value

+    @property
+    def keep_name_table(self):
+        """
+        Configures whether to keep the ``structured_name -> parameter_name`` dict in the loaded state dict.
+        This dict is the debugging information saved when calling ``paddle.save``.
+        It is generally only used for debugging and does not affect the actual training or inference.
+        By default, it is not retained in the ``paddle.load`` result. Default: False.
+
+        .. note::
+            Only used for ``paddle.load``.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+
+                paddle.disable_static()
+
+                linear = paddle.nn.Linear(5, 1)
+
+                state_dict = linear.state_dict()
+                paddle.save(state_dict, "paddle_dy")
+
+                configs = paddle.SaveLoadConfig()
+                configs.keep_name_table = True
+                para_state_dict, _ = paddle.load("paddle_dy", configs)
+
+                print(para_state_dict)
+                # the name_table is 'StructuredToParameterName@@'
+                # {'bias': array([0.], dtype=float32),
+                #  'StructuredToParameterName@@':
+                #     {'bias': u'linear_0.b_0', 'weight': u'linear_0.w_0'},
+                #  'weight': array([[ 0.04230034],
+                #     [-0.1222527 ],
+                #     [ 0.7392676 ],
+                #     [-0.8136974 ],
+                #     [ 0.01211023]], dtype=float32)}
+        """
+        return self._keep_name_table
+
+    @keep_name_table.setter
+    def keep_name_table(self, value):
+        if not isinstance(value, bool):
+            raise TypeError(
+                "The SaveLoadConfig.keep_name_table should be a bool value, but received input's type is %s."
+ % type(value)) + self._keep_name_table = value + +# NOTE(chenweihang): change jit.save/load argument `configs` to `config` +def deprecate_save_load_configs(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if 'configs' in kwargs: + kwargs['config'] = kwargs['configs'] + kwargs.pop('configs') + return func(*args, **kwargs) + + return wrapper + + +@deprecate_save_load_configs @switch_to_static_graph -def save(layer, model_path, input_spec=None, configs=None): +def save(layer, model_path, input_spec=None, config=None): """ Saves input declarative Layer as :ref:`api_imperative_TranslatedLayer` format model, which can be used for inference or fine-tuning after loading. @@ -627,7 +672,7 @@ def save(layer, model_path, input_spec=None, configs=None): It is the example inputs that will be passed to saved TranslatedLayer's forward function. If None, all input variables of the original Layer's forward function would be the inputs of the saved model. Default None. - configs (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object + config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies additional configuration options. Default None. Returns: None @@ -636,65 +681,76 @@ def save(layer, model_path, input_spec=None, configs=None): .. code-block:: python import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - BATCH_SIZE = 32 - BATCH_NUM = 20 + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 - def random_batch_reader(): - def _get_random_images_and_labels(image_shape, label_shape): - image = np.random.random(size=image_shape).astype('float32') - label = np.random.random(size=label_shape).astype('int64') - return image, label + IMAGE_SIZE = 784 + CLASS_NUM = 10 - def __reader__(): - for _ in range(BATCH_NUM): - batch_image, batch_label = _get_random_images_and_labels( - [BATCH_SIZE, 784], [BATCH_SIZE, 1]) - yield batch_image, batch_label + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples - return __reader__ + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label - class LinearNet(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): + def __len__(self): + return self.num_samples + + class LinearNet(nn.Layer): + def __init__(self): super(LinearNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) - @declarative + @paddle.jit.to_static def forward(self, x): return self._linear(x) + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) + # enable dygraph mode - fluid.enable_dygraph() + place = paddle.CPUPlace() + paddle.disable_static(place) - # create network - net = LinearNet(784, 1) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - # create data loader - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator(random_batch_reader()) - # train - for 
data in train_loader(): - img, label = data - label.stop_gradient = True + # 1. train & save model. - cost = net(img) + # create network + layer = LinearNet() + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) - avg_loss.backward() - adam.minimize(avg_loss) - net.clear_gradients() + # train + train(layer, loader, loss_fn, adam) - # save model + # save model_path = "linear.example.model" - fluid.dygraph.jit.save( - layer=net, - model_path=model_path, - input_spec=[img]) + paddle.jit.save(layer, model_path) """ def get_inout_spec(all_vars, target_vars, return_name=False): @@ -728,6 +784,7 @@ def save(layer, model_path, input_spec=None, configs=None): "The input layer of paddle.jit.save should be 'Layer', but received layer type is %s." % type(layer)) + configs = config if configs is None: configs = SaveLoadConfig() @@ -819,8 +876,9 @@ def save(layer, model_path, input_spec=None, configs=None): pickle.dump(extra_var_info, f, protocol=2) +@deprecate_save_load_configs @dygraph_only -def load(model_path, configs=None): +def load(model_path, config=None): """ :api_attr: imperative @@ -837,7 +895,7 @@ def load(model_path, configs=None): Args: model_path (str): The directory path where the model is saved. - configs (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies + config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies additional configuration options. Default None. Returns: @@ -849,122 +907,126 @@ def load(model_path, configs=None): .. 
code-block:: python import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - BATCH_SIZE = 32 - BATCH_NUM = 20 + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 - def random_batch_reader(): - def _get_random_images_and_labels(image_shape, label_shape): - image = np.random.random(size=image_shape).astype('float32') - label = np.random.random(size=label_shape).astype('int64') - return image, label + IMAGE_SIZE = 784 + CLASS_NUM = 10 - def __reader__(): - for _ in range(BATCH_NUM): - batch_image, batch_label = _get_random_images_and_labels( - [BATCH_SIZE, 784], [BATCH_SIZE, 1]) - yield batch_image, batch_label + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples - return __reader__ + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label - class LinearNet(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): + def __len__(self): + return self.num_samples + + class LinearNet(nn.Layer): + def __init__(self): super(LinearNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) - @declarative + @paddle.jit.to_static def forward(self, x): return self._linear(x) + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) + # enable dygraph mode - fluid.enable_dygraph() + place = paddle.CPUPlace() + paddle.disable_static(place) # 1. train & save model. + # create network - net = LinearNet(784, 1) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) + layer = LinearNet() + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) + # create data loader - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator(random_batch_reader()) + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + # train - for data in train_loader(): - img, label = data - label.stop_gradient = True + train(layer, loader, loss_fn, adam) - cost = net(img) + # save + model_path = "linear.example.model" + paddle.jit.save(layer, model_path) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + # 2. load model - avg_loss.backward() - adam.minimize(avg_loss) - net.clear_gradients() + # load + loaded_layer = paddle.jit.load(model_path) - model_path = "linear.example.model" - fluid.dygraph.jit.save( - layer=net, - model_path=model_path, - input_spec=[img]) - - # 2. load model & inference - # load model - infer_net = fluid.dygraph.jit.load(model_path) # inference - x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32')) - pred = infer_net(x) + loaded_layer.eval() + x = paddle.randn([1, IMAGE_SIZE], 'float32') + pred = loaded_layer(x) - # 3. 
load model & fine-tune
-            # load model
-            train_net = fluid.dygraph.jit.load(model_path)
-            train_net.train()
-            adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=train_net.parameters())
-            # create data loader
-            train_loader = fluid.io.DataLoader.from_generator(capacity=5)
-            train_loader.set_batch_generator(random_batch_reader())

            # fine-tune
-            for data in train_loader():
-                img, label = data
-                label.stop_gradient = True
-
-                cost = train_net(img)
+            loaded_layer.train()
+            adam = opt.Adam(learning_rate=0.001, parameters=loaded_layer.parameters())
+            train(loaded_layer, loader, loss_fn, adam)
-
-                loss = fluid.layers.cross_entropy(cost, label)
-                avg_loss = fluid.layers.mean(loss)
-
-                avg_loss.backward()
-                adam.minimize(avg_loss)
-                train_net.clear_gradients()

        2. Load the model saved by :ref:`api_fluid_io_save_inference_model` and then perform fine-tune training.

        .. code-block:: python

            import numpy as np
+            import paddle
            import paddle.fluid as fluid
+            import paddle.nn as nn
+            import paddle.optimizer as opt

-            BATCH_SIZE = 32
-            BATCH_NUM = 20
+            BATCH_SIZE = 16
+            BATCH_NUM = 4
+            EPOCH_NUM = 4

-            def random_batch_reader():
-                def _get_random_images_and_labels(image_shape, label_shape):
-                    image = np.random.random(size=image_shape).astype('float32')
-                    label = np.random.random(size=label_shape).astype('int64')
-                    return image, label
+            IMAGE_SIZE = 784
+            CLASS_NUM = 10

-                def __reader__():
-                    for _ in range(BATCH_NUM):
-                        batch_image, batch_label = _get_random_images_and_labels(
-                            [BATCH_SIZE, 784], [BATCH_SIZE, 1])
-                        yield batch_image, batch_label
+            # define a random dataset
+            class RandomDataset(paddle.io.Dataset):
+                def __init__(self, num_samples):
+                    self.num_samples = num_samples

-                return __reader__
+                def __getitem__(self, idx):
+                    image = np.random.random([IMAGE_SIZE]).astype('float32')
+                    label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
+                    return image, label

-            img = fluid.data(name='img', shape=[None, 784], dtype='float32')
+                def __len__(self):
+                    return self.num_samples
+
+            image = fluid.data(name='image', shape=[None, 784], dtype='float32')
            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
-            pred = fluid.layers.fc(input=img, size=10, act='softmax')
+            pred = fluid.layers.fc(input=image, size=10, act='softmax')
            loss = fluid.layers.cross_entropy(input=pred, label=label)
            avg_loss = fluid.layers.mean(loss)

@@ -975,9 +1037,15 @@ def load(model_path, configs=None):
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())

-            loader = fluid.io.DataLoader.from_generator(
-                feed_list=[img, label], capacity=5, iterable=True)
-            loader.set_batch_generator(random_batch_reader(), places=place)
+            # create data loader
+            dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
+            loader = paddle.io.DataLoader(dataset,
+                feed_list=[image, label],
+                places=place,
+                batch_size=BATCH_SIZE,
+                shuffle=True,
+                drop_last=True,
+                num_workers=2)

            # 1. train and save inference model
            for data in loader():
                exe.run(
                    fluid.default_main_program(),
                    feed=data,
                    fetch_list=[avg_loss])

            model_path = "fc.example.model"
            fluid.io.save_inference_model(
-                model_path, ["img"], [pred], exe)
+                model_path, ["image"], [pred], exe)
+
+            # 2. load model

            # enable dygraph mode
-            fluid.enable_dygraph()
+            paddle.disable_static(place)
+
+            # load
+            fc = paddle.jit.load(model_path)

-            # 2. load model & inference
-            fc = fluid.dygraph.jit.load(model_path)
-            x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32'))
+            # inference
+            fc.eval()
+            x = paddle.randn([1, IMAGE_SIZE], 'float32')
            pred = fc(x)

-            # 3. load model & fine-tune
-            fc = fluid.dygraph.jit.load(model_path)
+            # fine-tune
            fc.train()
-            sgd = fluid.optimizer.SGD(learning_rate=0.001,
-                parameter_list=fc.parameters())
-
-            train_loader = fluid.io.DataLoader.from_generator(capacity=5)
-            train_loader.set_batch_generator(
-                random_batch_reader(), places=place)
-
-            for data in train_loader():
-                img, label = data
-                label.stop_gradient = True
-
-                cost = fc(img)
-
-                loss = fluid.layers.cross_entropy(cost, label)
-                avg_loss = fluid.layers.mean(loss)
-
-                avg_loss.backward()
-                sgd.minimize(avg_loss)
+            loss_fn = nn.CrossEntropyLoss()
+            adam = opt.Adam(learning_rate=0.001, parameters=fc.parameters())
+            loader = paddle.io.DataLoader(dataset,
+                places=place,
+                batch_size=BATCH_SIZE,
+                shuffle=True,
+                drop_last=True,
+                num_workers=2)
+            for epoch_id in range(EPOCH_NUM):
+                for batch_id, (image, label) in enumerate(loader()):
+                    out = fc(image)
+                    loss = loss_fn(out, label)
+                    loss.backward()
+                    adam.step()
+                    adam.clear_grad()
+                    print("Epoch {} batch {}: loss = {}".format(
+                        epoch_id, batch_id, np.mean(loss.numpy())))
    """
-    return TranslatedLayer._construct(model_path, configs)
+    return TranslatedLayer._construct(model_path, config)


@dygraph_only
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 1ef719b9da187be659d9c898ec996b5ad0c0d8a6..7075024369f328b59ecac014b0960fc26f447ff2 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -29,6 +29,9 @@ from .layer_object_helper import LayerObjectHelper
from .base import program_desc_tracing_guard, param_guard
from paddle.fluid import framework
from ..param_attr import ParamAttr
+from paddle.fluid.executor import Executor, global_scope
+from paddle.fluid.framework import in_dygraph_mode
+from paddle.fluid.framework import _current_expected_place as _get_device

__all__ = ['Layer']

@@ -797,7 +800,7 @@ class Layer(core.Layer):
            raise ValueError(
                "super(YourLayer, self).__init__() should be called first")
        if len(self._loaddict_holder) > 0:
-            assert value.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in stat_dict".format(
+            assert value.name in self._loaddict_holder, "Parameter not found, cannot find [ {} ] in state_dict".format(
                value.name)

            value.set_value(self._loaddict_holder[value.name])

@@ -943,12 +946,13 @@ class Layer(core.Layer):
            destination = destination_temp
        return destination

-    def set_dict(self,
-                 stat_dict,
-                 include_sublayers=True,
-                 use_structured_name=True):
+    @framework.deprecate_stat_dict
+    def set_state_dict(self,
+                       state_dict,
+                       include_sublayers=True,
+                       use_structured_name=True):
        '''
-        Set parameters and persistable buffers from stat_dict. All the parameters and buffers will be reset by the tensor in the stat_dict
+        Set parameters and persistable buffers from state_dict. All the parameters and buffers will be reset by the tensors in the state_dict.

        Parameters:
            state_dict(dict) : Dict contains all the parameters and persistable buffers.
@@ -961,72 +965,67 @@ class Layer(core.Layer):
        Examples:
            ..
code-block:: python - import paddle.fluid as fluid - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + import paddle + + paddle.disable_static() + + emb = paddle.nn.Embedding([10, 10]) - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") - - para_state_dict, _ = fluid.load_dygraph( "paddle_dy") - - emb.set_dict( para_state_dict ) + state_dict = emb.state_dict() + paddle.save(state_dict, "paddle_dy") + + para_state_dict, _ = paddle.load("paddle_dy") - ''' - self.load_dict( - stat_dict, - include_sublayers=include_sublayers, - use_structured_name=use_structured_name) + emb.set_state_dict(para_state_dict) - def load_dict(self, - stat_dict, - include_sublayers=True, - use_structured_name=True): ''' - Set parameters and persistable buffers from stat_dict. All the parameters and persistabl buffers will be reset by the tensor in the stat_dict - This api will be Deprecated. Please use set_dict - - Parameters: - state_dict(dict) : Dict contains all the parameters and persistable buffers. - include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True - use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter or buffer name as key. - Default: True - Returns: - None - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) - - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") - - para_state_dict, _ = fluid.load_dygraph( "paddle_dy") - - emb.load_dict( para_state_dict ) - - ''' - - inner_state_dict = self.state_dict() + def _check_match(key, param): + state = state_dict.get(key, None) + if state is None: + raise ValueError("{} is not found in the provided dict.".format( + key)) + if list(state.shape) != list(param.shape): + raise ValueError( + "{} receives a shape {}, but the expected shape is {}.". + format(key, list(state.shape), list(param.shape))) + return param, state + + matched_param_state = [] + for key, param in self.state_dict().items(): + key_name = key if use_structured_name else param.name + try: + match_res = _check_match(key_name, param) + matched_param_state.append(match_res) + except ValueError as err: + warnings.warn(("Skip loading for {}. ".format(key) + str(err))) + + if in_dygraph_mode(): + for param, state in matched_param_state: + param.set_value(state) + else: - for name, param_or_buffer in inner_state_dict.items(): - key_name = name if use_structured_name else param_or_buffer.name - if key_name in stat_dict: - param_or_buffer.set_value(stat_dict[key_name]) - else: - raise RuntimeError( - "Parameter or persistable buffer not found, Can't find [ {} ] in stat_dict" - "use_structured_name is set to [{}]".format( - key_name, use_structured_name)) - unused_para_list = [] - for k, v in stat_dict.items(): - if k not in inner_state_dict: - unused_para_list.append(k) - if len(unused_para_list) > 0: - warnings.warn( - "Variables [ {} ] are not used, because not included in layers state_dict". 
format(" ".join(unused_para_list)))
+        def _set_var(var, ndarray):
+            t = global_scope().find_var(var.name).get_tensor()
+            p = t._place()
+            if p.is_cpu_place():
+                place = core.CPUPlace()
+            elif p.is_cuda_pinned_place():
+                place = core.CUDAPinnedPlace()
+            else:
+                p = core.Place()
+                p.set_place(t._place())
+                place = core.CUDAPlace(p.gpu_device_id())
+            t.set(ndarray, place)
+
+        executor = Executor(_get_device())._default_executor
+        # restore parameter states
+        core._create_loaded_parameter(
+            [param for param, state in matched_param_state],
+            global_scope(), executor)
+        for param, state in matched_param_state:
+            _set_var(param, state)
+
+    # [aliases] Compatible with old method names
+    set_dict = set_state_dict
+    load_dict = set_state_dict
diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
index cce383be7e22cd066199f814db80a75367862b82..cd6af6fd5b575e8188088bde9e8944ab94c7e0f8 100644
--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -97,7 +97,7 @@ class LearningRateDecay(object):
        """
        self.keys = ['step_num']

-    def set_dict(self, state_dict):
+    def set_state_dict(self, state_dict):
        """
        Loads the scheduler's state.
        """
@@ -114,6 +114,9 @@ class LearningRateDecay(object):
                "There are some unused values in state_dict. Maybe the optimizer has a different 'LearningRateDecay' when invoking state_dict and set_dict"
            )

+    # [aliases] Compatible with old method names
+    set_dict = set_state_dict
+
    def step(self):
        raise NotImplementedError()
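With the alias in place, scheduler state round-trips the same way under either method name. A minimal sketch (assuming fluid's dygraph mode; ``ExponentialDecay`` is used here purely as an illustration):

.. code-block:: python

    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        scheduler = fluid.dygraph.ExponentialDecay(
            learning_rate=0.1, decay_steps=10, decay_rate=0.5)

        state = scheduler.state_dict()   # captures the keys in self.keys, e.g. 'step_num'
        scheduler.set_state_dict(state)  # new name
        scheduler.set_dict(state)        # old name still works through the alias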
diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
index 5ecc713ddcace7a6bed05ffa4282d9f5c1041a44..472022bced7e3e2dd11d301501ebaec75e5e412a 100644
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -587,12 +587,13 @@ class DataParallel(layers.Layer):
            include_sublayers=include_sublayers,
            structured_name_prefix=structured_name_prefix)

-    def set_dict(self,
-                 stat_dict,
-                 include_sublayers=True,
-                 use_structured_name=True):
+    @framework.deprecate_stat_dict
+    def set_state_dict(self,
+                       state_dict,
+                       include_sublayers=True,
+                       use_structured_name=True):
        '''
-        Set parameters of self._layers from stat_dict. All the parameters of self._layers will be reset by the tensor in the stat_dict
+        Set parameters of self._layers from state_dict. All the parameters of self._layers will be reset by the tensors in the state_dict.

        Parameters:
            state_dict(dict) : Dict contains all the parameters
@@ -605,62 +606,27 @@ class DataParallel(layers.Layer):
        Examples:
            .. code-block:: python

-                import paddle.fluid as fluid
-                with fluid.dygraph.guard():
-                    strategy=fluid.dygraph.prepare_context()
-                    emb = fluid.dygraph.Embedding([10, 10])
-                    emb = fluid.dygraph.DataParallel(emb, strategy)
-
-                    state_dict = emb.state_dict()
-                    fluid.save_dygraph( state_dict, "paddle_dy")
-
-                    para_state_dict, _ = fluid.load_dygraph( "paddle_dy")
-
-                    emb.set_dict( para_state_dict )
+                import paddle

-        '''
-
-        self._layers.set_dict(
-            stat_dict,
-            include_sublayers=include_sublayers,
-            use_structured_name=use_structured_name)
-
-    def load_dict(self,
-                  stat_dict,
-                  include_sublayers=True,
-                  use_structured_name=True):
-        '''
-        Set parameters of self._layers from stat_dict. All the parameters of self._layers will be reset by the tensor in the stat_dict
-
-        This api will be Deprecated. Please use set_dict
-
-        Parameters:
-            state_dict(dict) : Dict contains all the parameters
-            include_sublayers(bool, optional) : If true, also include the parameters from sublayers. Default: True
-            use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter name as key.
-                                                  Default: True
-        Returns:
-            None
+                paddle.disable_static()

-        Examples:
-            .. code-block:: python
+                emb = paddle.nn.Embedding([10, 10])
+                emb = paddle.DataParallel(emb)

-                import paddle.fluid as fluid
-                with fluid.dygraph.guard():
-                    strategy=fluid.dygraph.prepare_context()
-                    emb = fluid.dygraph.Embedding([10, 10])
-                    emb = fluid.dygraph.DataParallel(emb, strategy)
+                state_dict = emb.state_dict()
+                paddle.save(state_dict, "paddle_dy")

-                    state_dict = emb.state_dict()
-                    fluid.save_dygraph( state_dict, "paddle_dy")
-
-                    para_state_dict, _ = fluid.load_dygraph( "paddle_dy")
+                para_state_dict, _ = paddle.load("paddle_dy")

-                    emb.load_dict( para_state_dict )
+                emb.set_state_dict(para_state_dict)

        '''
-        self._layers.load_dict(
-            stat_dict,
+        self._layers.set_state_dict(
+            state_dict,
            include_sublayers=include_sublayers,
            use_structured_name=use_structured_name)
+
+    # [aliases] Compatible with old method names
+    set_dict = set_state_dict
+    load_dict = set_state_dict
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index fc4e91aad4fff1db325e17828d26ccd94c164c3d..5281df9ead10acea5ae8656dcc4a0eed14fb3e83 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -36,6 +36,7 @@ from . import core
from . import unique_name
import paddle.version as fluid_version
import warnings
+import functools

__all__ = [
    'Program',
@@ -238,6 +239,25 @@ def _fake_interface_only_(func):
    return __impl__


+# NOTE(chenweihang): There is an argument name typo (`stat_dict`; the correct name
+# is `state_dict`) in the fluid APIs Layer.set_dict and Optimizer.load. In order to
+# correct the argument name without introducing compatibility issues, add this decorator.
+# NOTE(chenweihang): `wrap_decorator` is not used here because it moves kwargs
+# to args, which doesn't work in this case.
+def deprecate_stat_dict(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        if 'stat_dict' in kwargs:
+            warnings.warn(
+                "The argument `stat_dict` has been deprecated, please change it to `state_dict`.",
+                DeprecationWarning)
+            kwargs['state_dict'] = kwargs['stat_dict']
+            kwargs.pop('stat_dict')
+        return func(*args, **kwargs)
+
+    return wrapper
+
+
dygraph_not_support = wrap_decorator(_dygraph_not_support_)
dygraph_only = wrap_decorator(_dygraph_only_)
fake_interface_only = wrap_decorator(_fake_interface_only_)
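The same kwargs-renaming shim pattern is used in jit.py above for ``configs`` -> ``config`` (``deprecate_save_load_configs``). A minimal, self-contained sketch of how the decorator behaves; the ``set_state_dict`` function below is a stand-in, not Paddle's:

.. code-block:: python

    import functools
    import warnings

    def deprecate_stat_dict(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if 'stat_dict' in kwargs:
                warnings.warn("`stat_dict` is deprecated, use `state_dict`.",
                              DeprecationWarning)
                kwargs['state_dict'] = kwargs.pop('stat_dict')
            return func(*args, **kwargs)
        return wrapper

    @deprecate_stat_dict
    def set_state_dict(state_dict=None):
        # stand-in function; just echoes its argument
        return state_dict

    # the old keyword still reaches the renamed parameter (with a warning)
    assert set_state_dict(stat_dict={'w': 1}) == {'w': 1}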
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
index 5df5c55111f81463093cd40503466aa89841176c..3f826da3ae2beca51b639a69da4113e6d9580d6c 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
@@ -1,3 +1,4 @@
+# -*- coding: UTF-8 -*-
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -425,7 +426,23 @@ def find_heter_ops(program, default_device="cpu"):

def create_heter_program(program, config, heter_program, heter_ops,
                         block_var_detail, current_device):
-    # add heter op
+
+    # This function mainly includes the following contents:
+    # 1. For every heter block:
+    #     a) copy heter device op from origin program
+    #     b) create variables which belong to heter op:
+    #        -> if variable is persistable, clone it in global_scope
+    #        -> if variable is temp, create it in heter block
+    #     c) create communication-related ops as follows:
+    #        joint_var.0_1 -> slice -> reshape -> origin_var
+    #        origin_var -> origin_program
+    #        reshape -> concat -> joint_var.1_2
+    #     d) copy send op from origin program for var@grad which is located in the current heter block
+    #     e) re-check every op in the current block if its device is not the current heter device
+    # 2. Create send op for step counter in last heter-block
+    # 3. Create Listen&Serv OP for distributed training
+    # 4. update CompileTimeStrategy for heter_program
+
    optimizer_block = []
    grad_to_block_id = []
    send_grad_var_list = []
@@ -437,17 +454,10 @@ def create_heter_program(program, config, heter_program, heter_ops,
        for _, op in enumerate(heter_block_ops):
            block_append_op(heter_program, program, heter_block, op)

-            # add relate variables
-            inputs = _get_input_map_from_op(program.global_block().vars, op)
-            add_vars_by_op_map(inputs, heter_program)
-
-            outputs = _get_output_map_from_op(program.global_block().vars, op)
-            add_vars_by_op_map(outputs, heter_program)
-
        entrance_vars = block_var_detail[index]["entrance"]
-        add_vars_by_var_list(entrance_vars, program, heter_program)
+        add_vars_by_var_list(entrance_vars, program, heter_program, heter_block)
        exit_vars = block_var_detail[index]["exit"]
-        add_vars_by_var_list(exit_vars, program, heter_program)
+        add_vars_by_var_list(exit_vars, program, heter_program, heter_block)

        comm_info = get_communicate_var_info(program, index, entrance_vars,
                                             exit_vars)
@@ -455,13 +465,13 @@
        grad_to_block_id.append(comm_info["block_input_var_name"] + ":" + str(
            heter_block.idx))

-        # create slice op
        first_op_index = 0

        get_type_var_name = comm_info["input_var_reshape_name"][0].split(
            ".input_reshape@Heter")[0]
-        get_type_var = heter_program.global_block().vars[get_type_var_name]
+        get_type_var = heter_block.vars[get_type_var_name]

+        # create slice op
        insert_recv_slice_op(
            heter_program, heter_block, first_op_index,
            comm_info["block_input_var_name"],
@@ -471,6 +481,13 @@ def create_heter_program(program, config, heter_program, heter_ops,
            for i in range(len(comm_info["input_var_reshape_dim"]))
        ])
        first_op_index += len(comm_info["input_var_reshape_dim"])
+
+        heter_program.global_block().create_var(
+            name=comm_info["block_input_var_name"],
+            shape=(-1, sum(comm_info["input_var_reshape_dim"])),
+            dtype=get_type_var.dtype,
+            type=get_type_var.type)
+
        # create reshape op
        for i in range(len(comm_info["input_var_reshape_name"])):
            var_name = entrance_vars[i]
@@ -498,13 +515,14 @@ def create_heter_program(program, config, heter_program, heter_ops,
            comm_info["block_output_var_name"],
            [-1, sum(comm_info["output_var_reshape_dim"])])
        check_op_device(heter_block, current_device)
+
+        # add send op
        send_grad_var_list = send_grad_var_list + add_heter_send_op(
            program, heter_program, heter_block, block_var_detail[index])

        # add step counter
        send_input_vars = []
        dummy_output = []
-        trainer_id = config.get_role_id()
        pserver_endpoints = config.get_ps_endpoints()
        optimizer_block[-1].append_op(
            type="send",
@@ -539,7 +557,6 @@ def create_heter_program(program, config, heter_program, heter_ops,
    # append the listen_and_serv op
    heter_program.global_block().append_op(
        type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=attrs)
-
    check_heter_compile_time_strategy(program, config, send_grad_var_list)
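As a reading aid for the comment block above, the entrance/exit packing is essentially a reshape/concat on the send side mirrored by a slice/reshape on the receive side. A toy eager-mode sketch with made-up shapes (the real pass emits these as program-desc ops, not eager calls):

.. code-block:: python

    import paddle

    paddle.disable_static()

    # two "entrance" variables on one side of the heter boundary
    a = paddle.randn([8, 4])
    b = paddle.randn([8, 6])

    # send side: reshape -> concat -> joint var of shape [8, 10]
    joint = paddle.concat(
        [paddle.reshape(a, [8, 4]), paddle.reshape(b, [8, 6])], axis=1)

    # recv side: slice -> reshape -> the original variables
    a2, b2 = paddle.split(joint, num_or_sections=[4, 6], axis=1)
    assert a2.shape == [8, 4] and b2.shape == [8, 6]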
@@ -558,6 +575,16 @@ def check_heter_compile_time_strategy(program, config, send_grad_var_list):


def create_trainer_program(program, config, heter_ops, block_var_detail):
+    # This function mainly includes the following contents:
+    # 1. For every heter block in the origin program:
+    #     a) delete heter op and related variables
+    #     b) add send&recv op
+    #     c) add communication ops as follows:
+    #        origin_var -> reshape -> concat -> joint_var.0_1
+    #        send&recv op(send joint_var.0_1; recv joint_var.1_2)
+    #        joint_var.1_2 -> slice -> reshape -> origin_var
+    #     d) remove send ops whose related var@grad is not in the trainer program
+    # 2. check every op's device
    for device in heter_ops.keys():
        for heter_block_index in sorted(heter_ops[device]):
            replace_ops_by_communicate_op(program, config, heter_block_index,
@@ -916,19 +943,19 @@ def insert_reshape_op(program,
                      var_name,
                      new_var_name,
                      new_var_shape=None):
-    input_var = program.global_block().vars[var_name]
+    input_var = block.vars[var_name]

-    if new_var_name not in program.global_block().vars:
-        out = program.global_block().create_var(
+    if new_var_name not in block.vars:
+        out = block.create_var(
            name=new_var_name,
            shape=new_var_shape,
            dtype=input_var.dtype,
            type=input_var.type)
    else:
-        out = program.global_block().vars[new_var_name]
+        out = block.vars[new_var_name]
        new_var_shape = out.shape

-    x_shape = program.global_block().create_var(
+    x_shape = block.create_var(
        name="{}.xshape@Heter".format(var_name), dtype=input_var.dtype)
    block._insert_op(
        index=index,
@@ -941,9 +968,7 @@ def insert_reshape_op(program,

def insert_send_concat_op(program, block, index, var_name_list, new_var_name,
                          new_var_shape):
-    input_var_list = [
-        program.global_block().vars[var_name] for var_name in var_name_list
-    ]
+    input_var_list = [block.vars[var_name] for var_name in var_name_list]

    out = program.global_block().create_var(
        name=new_var_name,
@@ -971,14 +996,14 @@ def insert_recv_slice_op(program, block, index, var_name, var_shape, dtype,

    out_list = []
    for i in range(len(new_var_name_list)):
-        if new_var_name_list[i] not in program.global_block().vars:
-            out = program.global_block().create_var(
+        if new_var_name_list[i] not in block.vars:
+            out = block.create_var(
                name=new_var_name_list[i],
                shape=new_var_shape_list[i],
                dtype=input_var.dtype,
                type=input_var.type)
        else:
-            out = program.global_block().vars[new_var_name_list[i]]
+            out = block.vars[new_var_name_list[i]]
        out_list.append(out)

    start_index = 0
@@ -1021,21 +1046,33 @@ def deleter_trainer_useless_var(program):


def block_append_op(program, origin_program, block, op):
-    inputs = _get_input_map_from_op(origin_program.global_block().vars, op)
+    merge_ordereddict = origin_program.global_block().vars.copy()
+    merge_ordereddict.update(block.vars)
+    inputs = _get_input_map_from_op(merge_ordereddict, op)
    for key, varlist in six.iteritems(inputs):
        if not isinstance(varlist, list):
            varlist = [varlist]
        for var in varlist:
-            if var.name not in
program.global_block().vars: - program.global_block()._clone_variable(var) + if var.name not in program.global_block( + ).vars and var.name not in block.vars: + if var.persistable: + program.global_block()._clone_variable( + var, force_persistable=False) + else: + block._clone_variable(var, force_persistable=False) if "_grad" not in op.type: # for forward op @@ -1060,21 +1097,15 @@ def block_append_op(program, origin_program, block, op): block._sync_with_cpp() -def add_vars_by_op_map(var_map, program): - for key, varlist in six.iteritems(var_map): - if not isinstance(varlist, list): - varlist = [varlist] - for i in range(len(varlist)): - var = varlist[i] - if var.name not in program.global_block().vars: - program.global_block()._clone_variable(var) - - -def add_vars_by_var_list(var_name_list, origin_program, program): +def add_vars_by_var_list(var_name_list, origin_program, program, block): for var_name in var_name_list: if var_name not in program.global_block().vars: var = origin_program.global_block().vars[var_name] - program.global_block()._clone_variable(var) + if var.persistable: + program.global_block()._clone_variable( + var, force_persistable=False) + else: + block._clone_variable(var, force_persistable=False) def get_varlist_from_op_map(var_map): diff --git a/python/paddle/fluid/inference/__init__.py b/python/paddle/fluid/inference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3013c1f2aff87fb293ea984c99d8336b418ee080 --- /dev/null +++ b/python/paddle/fluid/inference/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .wrapper import Config, DataType, PlaceType, PrecisionType, Tensor, Predictor + +from ..core import create_predictor, get_version, get_num_bytes_of_data_type, PredictorPool diff --git a/python/paddle/fluid/inference/wrapper.py b/python/paddle/fluid/inference/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..96885edcc5e822beb5db8332f2b58d12b9c4ff63 --- /dev/null +++ b/python/paddle/fluid/inference/wrapper.py @@ -0,0 +1,23 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ..core import AnalysisConfig, PaddleDType, PaddlePlace +from ..core import PaddleInferPredictor, PaddleInferTensor + +DataType = PaddleDType +PlaceType = PaddlePlace +PrecisionType = AnalysisConfig.Precision +Config = AnalysisConfig +Tensor = PaddleInferTensor +Predictor = PaddleInferPredictor diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index a264f9a70fe7aa97f2b45797557677e00c45a1f1..70f48e82fdf6564698098cbdf6e1290f7fba0e18 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4815,11 +4815,6 @@ def split(input, num_or_sections, dim=-1, name=None): Returns: list(Tensor): The list of segmented Tensors. - Raises: - TypeError: The data type of ``input`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: ``num_or_sections`` is not int, list or tuple. - TypeError: ``dim`` is not int or Tensor. The data type of ``dim`` must be int32 or int64 when it's a Tensor. - Example: .. code-block:: python @@ -6103,11 +6098,6 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): Returns: Tensor: A reshaped Tensor with the same data type as ``x``. It is a new tensor variable if ``inplace`` is ``False``, otherwise it is ``x``. If ``act`` is None, return the reshaped tensor variable, otherwise return the activated tensor variable. - Raises: - TypeError: If actual_shape is neither Tensor nor None. - ValueError: If more than one elements of ``shape`` is -1. - ValueError: If the element of ``shape`` is 0, the corresponding dimension should be less than or equal to the dimension of ``x``. - ValueError: If the elements in ``shape`` is negative except -1. Examples: .. code-block:: python @@ -6316,6 +6306,15 @@ def unsqueeze(input, axes, name=None): """ if in_dygraph_mode(): + if isinstance(axes, int): + axes = [axes] + elif isinstance(axes, Variable): + axes = [axes.numpy().item(0)] + elif isinstance(axes, (list, tuple)): + axes = [ + item.numpy().item(0) if isinstance(item, Variable) else item + for item in axes + ] out, _ = core.ops.unsqueeze2(input, 'axes', axes) return out @@ -8256,10 +8255,6 @@ def gather(input, index, overwrite=True): Returns: output (Tensor): The output is a tensor with the same rank as input. - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must to be one of float16, float32, float64, int32, int64, uint8. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64. - Examples: .. code-block:: python @@ -8349,10 +8344,6 @@ def gather_nd(input, index, name=None): Returns: output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:] - - Raises: - TypeError: ``input`` must be a Tensor and the data type of ``input`` must be one of float32, float64, int32 and int64. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be one of int32 and int64. Examples: @@ -10018,15 +10009,16 @@ def stack(x, axis=0, name=None): Args: - x (Variable|list(Variable)): Input :code:`x` can be a single Tensor, a :code:`list` of Tensors. - If :code:`x` is a :code:`list`, the shapes of all these Tensors + x (list(Variable)|tuple(Variable)): Input :code:`x` can be a :code:`list` or :code:`tuple` of Tensors, the shapes of all these Tensors must be the same. Supposing input is N dims Tensors :math:`[d_0, d_1, ..., d_{n-1}]`, the output is N+1 dims Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`. Supported data types: float32, float64, int32, int64. 
-        axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is :math:`[-(R+1), R+1)`.
-                              R is the first tensor of inputs. If ``axis`` < 0, :math:`axis=axis+rank(x[0])+1`.
-                              The default value of axis is 0.
+        axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``,
+                              where ``R`` is the number of dimensions of the first input tensor ``x[0]``.
+                              If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0.
+        name (str, optional): Please refer to :ref:`api_guide_Name`. Default: None.
+
    Returns:
        Variable: The stacked Tensor, has same data type with input Tensors. Output dim is :math:`rank(x[0])+1`.

@@ -10044,18 +10036,27 @@ def stack(x, axis=0, name=None):

        data = layers.stack([x1,x2], axis=1) # stack according to axis 1, data.shape=[None, 2, 1, 2]

-        # stack single Tensor
-        data = layers.stack(x1) # stack according to axis 0, data.shape=[1, None, 1, 2]

    """
    axis = 0 if axis is None else axis

-    if not isinstance(x, list) and not isinstance(x, tuple):
-        x = [x]

    if in_dygraph_mode():
        return core.ops.stack(x, 'axis', axis)

+    if not isinstance(x, list) and not isinstance(x, tuple):
+        # NOTE:(zhiqiu) Only support Variable as input if the Variable is a LOD_TENSOR_ARRAY created by create_array, array_write, array_read, etc.
+        # In that case, the Variable is indeed an array of tensors.
+        if isinstance(x, Variable) and x.desc.type(
+        ) == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
+            x = [x]
+        else:
+            raise TypeError("The type of '%s' in %s must be %s, but received %s"
+                            % ('x', 'stack',
+                               'list[Tensor], tuple[Tensor] or TensorArray',
+                               type(x)))
+
    helper = LayerHelper('stack', **locals())
+
    out = helper.create_variable_for_type_inference(x[0].dtype)
    if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
        assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \
@@ -10600,7 +10601,7 @@ def gaussian_random(shape,
        dtype = convert_np_dtype_to_dtype_(dtype)

    if in_dygraph_mode():
-        shape = utils._convert_shape_to_list(shape)
+        shape = utils.convert_shape_to_list(shape)
        return core.ops.gaussian_random('shape', shape, 'mean',
                                        float(mean), 'std',
                                        float(std), 'seed', seed, 'dtype',
@@ -10617,7 +10618,7 @@ def gaussian_random(shape,
        'dtype': dtype,
        'use_mkldnn': False
    }
-    utils._get_shape_tensor_inputs(
+    utils.get_shape_tensor_inputs(
        inputs=inputs,
        attrs=attrs,
        shape=shape,
@@ -12165,13 +12166,10 @@ def logical_and(x, y, out=None, name=None):
        .. code-block:: python

            import paddle
-            import numpy as np

            paddle.disable_static()
-            x_data = np.array([True], dtype=np.bool)
-            y_data = np.array([True, False, True, False], dtype=np.bool)
-            x = paddle.to_tensor(x_data)
-            y = paddle.to_tensor(y_data)
+            x = paddle.to_tensor([True])
+            y = paddle.to_tensor([True, False, True, False])
            res = paddle.logical_and(x, y)
            print(res.numpy()) # [True False True False]
    """
@@ -12284,11 +12282,9 @@ def logical_not(x, out=None, name=None):

    Examples:
        ..
code-block:: python import paddle - import numpy as np paddle.disable_static() - x_data = np.array([True, False, True, False], dtype=np.bool) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([True, False, True, False]) res = paddle.logical_not(x) print(res.numpy()) # [False True False True] """ @@ -15106,7 +15102,7 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0, dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) return core.ops.uniform_random('shape', shape, 'min', float(min), 'max', float(max), 'seed', seed, 'dtype', dtype) @@ -15116,7 +15112,7 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0, inputs = dict() attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype} - utils._get_shape_tensor_inputs( + utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand') helper = LayerHelper("uniform_random", **locals()) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 84cacea6ba5723f8a06fc87fa9c59d96f802e65a..1efae3ddf1f3422a53f69c4b5b8eeec6183fae96 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -86,13 +86,11 @@ add_sample_code(globals()["sigmoid"], r""" Examples: .. code-block:: python - import numpy as np import paddle import paddle.nn.functional as F paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.sigmoid(x) print(out.numpy()) # [0.40131234 0.450166 0.52497919 0.57444252] @@ -103,13 +101,11 @@ add_sample_code(globals()["logsigmoid"], r""" Examples: .. code-block:: python - import numpy as np import paddle import paddle.nn.functional as F paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.logsigmoid(x) print(out.numpy()) # [-0.91301525 -0.79813887 -0.64439666 -0.55435524] @@ -120,12 +116,10 @@ add_sample_code(globals()["exp"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.exp(x) print(out.numpy()) # [0.67032005 0.81873075 1.10517092 1.34985881] @@ -136,12 +130,10 @@ add_sample_code(globals()["tanh"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.tanh(x) print(out.numpy()) # [-0.37994896 -0.19737532 0.09966799 0.29131261] @@ -152,12 +144,10 @@ add_sample_code(globals()["atan"], r""" Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.atan(x) print(out.numpy()) # [-0.38050638 -0.19739556 0.09966865 0.29145679] @@ -170,11 +160,10 @@ Examples: import paddle import paddle.nn.functional as F - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] """) @@ -183,12 +172,10 @@ add_sample_code(globals()["sqrt"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([0.1, 0.2, 0.3, 0.4]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4]) out = paddle.sqrt(x) print(out.numpy()) # [0.31622777 0.4472136 0.54772256 0.63245553] @@ -199,12 +186,10 @@ add_sample_code(globals()["rsqrt"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([0.1, 0.2, 0.3, 0.4]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4]) out = paddle.rsqrt(x) print(out.numpy()) # [3.16227766 2.23606798 1.82574186 1.58113883] @@ -215,12 +200,10 @@ add_sample_code(globals()["abs"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.abs(x) print(out.numpy()) # [0.4 0.2 0.1 0.3] @@ -231,12 +214,10 @@ add_sample_code(globals()["ceil"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.ceil(x) print(out.numpy()) # [-0. -0. 1. 1.] @@ -247,12 +228,10 @@ add_sample_code(globals()["floor"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.floor(x) print(out.numpy()) # [-1. -1. 0. 0.] @@ -263,12 +242,10 @@ add_sample_code(globals()["cos"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.cos(x) print(out.numpy()) # [0.92106099 0.98006658 0.99500417 0.95533649] @@ -279,12 +256,10 @@ add_sample_code(globals()["acos"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.acos(x) print(out.numpy()) # [1.98231317 1.77215425 1.47062891 1.26610367] @@ -295,12 +270,10 @@ add_sample_code(globals()["sin"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.sin(x) print(out.numpy()) # [-0.38941834 -0.19866933 0.09983342 0.29552021] @@ -311,12 +284,10 @@ add_sample_code(globals()["asin"], r""" Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.asin(x) print(out.numpy()) # [-0.41151685 -0.20135792 0.10016742 0.30469265] @@ -327,12 +298,10 @@ add_sample_code(globals()["cosh"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.cosh(x) print(out.numpy()) # [1.08107237 1.02006676 1.00500417 1.04533851] @@ -343,12 +312,10 @@ add_sample_code(globals()["sinh"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.sinh(x) print(out.numpy()) # [-0.41075233 -0.201336 0.10016675 0.30452029] @@ -359,12 +326,10 @@ add_sample_code(globals()["round"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.5, -0.2, 0.6, 1.5]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.5, -0.2, 0.6, 1.5]) out = paddle.round(x) print(out.numpy()) # [-1. -0. 1. 2.] @@ -375,12 +340,10 @@ add_sample_code(globals()["reciprocal"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.reciprocal(x) print(out.numpy()) # [-2.5 -5. 10. 3.33333333] @@ -391,12 +354,10 @@ add_sample_code(globals()["square"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.square(x) print(out.numpy()) # [0.16 0.04 0.01 0.09] @@ -409,11 +370,10 @@ Examples: import paddle import paddle.nn.functional as F - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355] """) @@ -424,11 +384,10 @@ Examples: import paddle import paddle.nn.functional as F - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] """) @@ -761,11 +720,9 @@ Examples: .. 
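All of these sample-code updates apply one mechanical change: `paddle.to_tensor` accepts a plain Python list, so the `numpy` import and the older `paddle.to_variable(np.array(...))` round-trip are unnecessary. A minimal sketch of the equivalence, assuming a build with this patch applied:

```python
import numpy as np
import paddle

paddle.disable_static()

x_old = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))  # old pattern
x_new = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])            # new pattern

print(np.allclose(x_old.numpy(), x_new.numpy()))  # True
```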
code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.erf(x) print(out.numpy()) # [-0.42839236 -0.22270259 0.11246292 0.32862676] diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 77a78eb4a14a0a5ad9be9cff71131ca473106ab8..a90551c1b7b4fd45ae9a0e1cfa225a87db811295 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -29,6 +29,7 @@ from ..data_feeder import check_variable_and_dtype, check_type, check_dtype, con from paddle.utils import deprecated import numpy import warnings +from .utils import check_shape __all__ = [ 'create_tensor', 'create_parameter', 'create_global_var', 'cast', @@ -276,11 +277,6 @@ def concat(input, axis=0, name=None): name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - Raises: - TypeError: ``input`` must be one of list, tuple or Tensor. - TypeError: The data type of ``input`` must be one of bool, float16, float32, float64, int32 and int64. - TypeError: The ``axis`` must be int or Tensor. The dtype of ``axis`` must be int32 or int64 when it's a Tensor. - TypeError: All the Tensors in ``input`` must have the same data type. Returns: Tensor: A Tensor with the same data type as ``input``. @@ -657,12 +653,6 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): Returns: Tensor: Tensor which is created according to shape and dtype. - Raises: - TypeError: The dtype must be one of bool, float16, float32, float64, int32 and int64 - and the data type of ``out`` must be the same as the ``dtype``. - TypeError: The shape must be one of list, tuple and Tensor, the data type of ``shape`` - must be int32 or int64 when ``shape`` is a Tensor - Examples: .. code-block:: python @@ -694,7 +684,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): attrs['str_value'] = str(float(value)) if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) if out is None: out = _varbase_creator(dtype=dtype) @@ -718,20 +708,18 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): value = cast(value, dtype) inputs['ValueTensor'] = value + check_shape(shape) check_dtype(dtype, 'dtype', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'fill_constant') check_type(shape, 'shape', (Variable, list, tuple), 'fill_constant') - if isinstance(shape, Variable): - check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'fill_constant') - if out is not None: check_variable_and_dtype(out, 'out', [convert_dtype(dtype)], 'fill_constant') helper = LayerHelper("fill_constant", **locals()) - utils._get_shape_tensor_inputs( + utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, op_type='fill_constant') if out is None: @@ -1050,10 +1038,6 @@ def ones(shape, dtype, force_cpu=False): Returns: Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 1. - Raises: - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: The ``shape`` must be one of list, tuple and Tensor. The data type of ``shape`` must - be int32 or int64 when it's a Tensor. Examples: .. 
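With the inline checks replaced by the new `check_shape` helper (see the utils.py hunk further below), `fill_constant` still accepts `shape` as a list, a tuple, or a 1-D integer Tensor. A short static-graph sketch of the three forms, assuming this branch:

```python
import paddle.fluid as fluid

# shape as a Python list or tuple of non-negative ints
a = fluid.layers.fill_constant(shape=[2, 3], dtype='int64', value=1)
b = fluid.layers.fill_constant(shape=(2, 3), dtype='float32', value=0.5)

# shape as a 1-D int32/int64 Tensor; other shape dtypes are rejected
shape_t = fluid.layers.fill_constant(shape=[2], dtype='int64', value=3)
c = fluid.layers.fill_constant(shape=shape_t, dtype='float32', value=2.0)
```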
code-block:: python @@ -1086,10 +1070,6 @@ def zeros(shape, dtype, force_cpu=False, name=None): Returns: Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 0. - Raises: - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: The ``shape`` must be one of list, tuple and Tensor. The data type of ``shape`` must - be int32 or int64 when it's a Tensor. Examples: .. code-block:: python @@ -1453,14 +1433,6 @@ def linspace(start, stop, num, dtype=None, name=None): the data shape of this tensor is :math:`[num]` . If the :attr:`num` is set 1, the output tensor just has \ the value with input :attr:`start`. - Raises: - TypeError: The ``dtype`` must be one of int32, int64, float32 and float64. - TypeError: The type of ``num`` must be int When it's not a Tensor. - TypeError: The data type of ``num`` must be int32 When it's a Tensor. - TypeError: The data type of ``start`` and ``stop`` must be same as ``dtype`` When it's a Tensor. - - - Examples: .. code-block:: python @@ -1474,6 +1446,8 @@ def linspace(start, stop, num, dtype=None, name=None): tensor_num = num tensor_start = start tensor_stop = stop + if not isinstance(num, Variable): + check_type(num, 'num', (int), 'linspace') if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) if not isinstance(start, Variable): @@ -1488,21 +1462,32 @@ def linspace(start, stop, num, dtype=None, name=None): helper = LayerHelper("linspace", **locals()) + start_dtype = convert_dtype(tensor_start.dtype) + stop_dtype = convert_dtype(tensor_stop.dtype) + out_dtype = convert_dtype(dtype) if isinstance(start, Variable): - check_dtype(start.dtype, 'start', (convert_dtype(dtype)), 'linspace') + check_dtype(start.dtype, 'start', + ['float32', 'float64', 'int32', 'int64'], 'linspace') else: check_type(start, 'start', (int, float), 'linspace') if isinstance(stop, Variable): - check_dtype(stop.dtype, 'stop', (convert_dtype(dtype)), 'linspace') + check_dtype(stop.dtype, 'stop', + ['float32', 'float64', 'int32', 'int64'], 'linspace') else: check_type(stop, 'stop', (int, float), 'linspace') if isinstance(num, Variable): check_dtype(num.dtype, 'num', ['int32'], 'linspace') - else: - check_type(num, 'num', (int), 'linspace') check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'], 'linspace') + if ((stop_dtype == "float64" or start_dtype == "float64") and + out_dtype in ["float32", "int32"]) or ((stop_dtype == "int64" or + start_dtype == "int64") and + out_dtype == "int32"): + raise ValueError( + "The dtype of start/stop is {}/{} but the attr(dtype) of linspace is {}, " + "which may cause data type overflows. Please reset attr(dtype) of linspace." + .format(start_dtype, stop_dtype, dtype)) out = helper.create_variable_for_type_inference(dtype=dtype) @@ -1629,9 +1614,6 @@ def eye(num_rows, Returns: Tensor: An identity Tensor or LoDTensor of shape batch_shape + [num_rows, num_columns]. - Raises: - TypeError: The `dtype` must be one of float16, float32, float64, int32 and int64. - TypeError: The `num_columns` must be non-negative int. Examples: .. 
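The new `linspace` validation accepts float32/float64/int32/int64 start/stop tensors but rejects dtype combinations that would narrow the inputs (float64 inputs with a float32 or int32 result dtype, int64 inputs with an int32 result dtype). A sketch of the behavior, assuming this branch:

```python
import paddle.fluid as fluid

start = fluid.layers.fill_constant([1], 'float64', 0.0)
stop = fluid.layers.fill_constant([1], 'float64', 10.0)

out = fluid.layers.linspace(start, stop, 5, dtype='float64')  # accepted

try:
    # float64 start/stop narrowed to float32: now raises instead of truncating
    fluid.layers.linspace(start, stop, 5, dtype='float32')
except ValueError as e:
    print(e)  # "... may cause data type overflows ..."
```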
code-block:: python diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index 0d6965239e14b92d3d4997a9cf8efbe3fa7048b7..2095c9957e75b94396e573eba341f4cfded5dbc8 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -20,6 +20,7 @@ import numpy as np from ..framework import Variable from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..layer_helper import LayerHelper +from sys import version_info def convert_to_list(value, n, name, dtype=np.int): @@ -282,7 +283,7 @@ def _contain_var(list_or_tuple): return False -def _get_shape_tensor_inputs(inputs, attrs, shape, op_type): +def get_shape_tensor_inputs(inputs, attrs, shape, op_type): from .tensor import fill_constant, cast def _get_attr_shape(list_shape): @@ -347,7 +348,7 @@ def _convert_to_tensor_list(old_list, dtype="int32"): return new_list_tensor -def _convert_shape_to_list(shape): +def convert_shape_to_list(shape): """ Convert shape(list, tuple, variable) to list in imperative mode """ @@ -358,3 +359,22 @@ def _convert_shape_to_list(shape): else: shape = list(shape.numpy().astype(int)) return shape + + +def check_shape(shape): + """ + Check shape type and shape elements type before passing it to fill_constant + """ + if isinstance(shape, Variable): + check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'fill_constant') + else: + for ele in shape: + if not isinstance(ele, Variable): + if not isinstance(ele, six.integer_types): + raise TypeError( + "All elements in ``shape`` must be integers when it's a list or tuple" + ) + if ele < 0: + raise ValueError( + "All elements in ``shape`` must be non-negative when it's a list or tuple" + ) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 9e2d77df777d761b6904d8916c7a35fb8e6bfaba..8b37cfef3890eace0ff5141eeb91d85e78f1c964 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -170,7 +170,7 @@ class Optimizer(object): return state_dict @framework.dygraph_only - def set_dict(self, state_dict): + def set_state_dict(self, state_dict): ''' Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed. @@ -182,20 +182,22 @@ class Optimizer(object): Examples: ..
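For clarity, the list/tuple path of the `check_shape` helper added above boils down to the following standalone sketch (`check_shape_sketch` is an invented name; the real helper also skips elements that are themselves Tensors and uses `six.integer_types` instead of `int`):

```python
def check_shape_sketch(shape):
    for ele in shape:
        # the type check runs first, so non-integers get a TypeError
        if not isinstance(ele, int):
            raise TypeError(
                "All elements in ``shape`` must be integers when it's a list or tuple")
        if ele < 0:
            raise ValueError(
                "All elements in ``shape`` must be non-negative when it's a list or tuple")

check_shape_sketch([2, 3])       # passes silently
try:
    check_shape_sketch([2, -1])  # negative dimension is rejected
except ValueError as e:
    print(e)
```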
code-block:: python - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + import paddle + + paddle.disable_static() + + emb = paddle.nn.Embedding([10, 10]) - state_dict = emb.state_dict() - fluid.save_dygraph(state_dict, "paddle_dy") + state_dict = emb.state_dict() + paddle.save(state_dict, "paddle_dy") - adam = fluid.optimizer.Adam(learning_rate=fluid.layers.noam_decay( 100, 10000), + adam = paddle.optimizer.Adam(learning_rate=0.001, - parameter_list=emb.parameters()) + parameters=emb.parameters()) - state_dict = adam.state_dict() - fluid.save_dygraph(state_dict, "paddle_dy") + state_dict = adam.state_dict() - para_state_dict, opti_state_dict = fluid.load_dygraph( "paddle_dy") + para_state_dict, opti_state_dict = paddle.load("paddle_dy") - adam.set_dict(opti_state_dict) + adam.set_state_dict(opti_state_dict) ''' from paddle.optimizer.lr_scheduler import _LRScheduler @@ -257,6 +259,9 @@ class Optimizer(object): tensor.set(load_para_np, framework._current_expected_place()) + # [aliases] Compatible with old method names + set_dict = set_state_dict + def get_opti_var_name_list(self): return self._opti_name_list @@ -4595,15 +4600,16 @@ class RecomputeOptimizer(Optimizer): ), "_checkpoints should be a list of Variable or a list of String" self._checkpoints = checkpoints - def load(self, stat_dict): + @framework.deprecate_stat_dict + def load(self, state_dict): """ - :api_attr: Static Graph + :api_attr: Static Graph load function is not supported by Recompute Optimizer for now. :return: None Args: - stat_dict: the dict load by load_persistable method + state_dict: the dict load by load_persistable method Examples: .. code-block:: python @@ -4627,8 +4633,8 @@ class RecomputeOptimizer(Optimizer): sgd = fluid.optimizer.RecomputeOptimizer(sgd) sgd._set_checkpoints([fc_1, pred]) try: - stat_dict = {} - sgd.load(stat_dict) + state_dict = {} + sgd.load(state_dict) except NotImplementedError as e: print(cpt.get_exception_message(e)) """ diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a25cba029dd8bac81d6b00c1d9fb710f421ce9d0..935813251930b8093dd0c8a9f11ac30772133d20 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -47,6 +47,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function) list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor) list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto) foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() @@ -458,6 +459,7 @@ if(WITH_DISTRIBUTE) py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS}) py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS}) + py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS}) if(NOT WIN32) py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS}) diff --git a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py index
863c001f226f86384e2820cb6877ded48cffa119..15e98481c26b20de4e9fa493fa022380ba1fcd63 100644 --- a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py +++ b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py @@ -153,7 +153,7 @@ def gen_fake_line(dnn_data_num=7, return line -def prepare_fake_data(file_nums=8, file_lines=1000): +def prepare_fake_data(file_nums=9, file_lines=1000): """ Create fake data with same type as avazu_ctr_data """ diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py index f37dff060cd539d400e52317dc1f3dce0c350ed9..b0bf203da664ae2a886ce90c56013300e25372c5 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py @@ -178,7 +178,7 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase): exe.run(fluid.default_startup_program()) fleet.init_worker() - thread_num = 1 + thread_num = int(os.getenv("CPU_NUM", 2)) batch_size = 128 filelist = fleet_util.get_file_shard(train_file_list) print("filelist: {}".format(filelist)) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py index 75bff108dd43665df0fc1c8b166a935946b4fbc7..ba0adaf32e15db71162aed71c042100a0cd50e26 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py @@ -81,7 +81,7 @@ class PredictorTools(object): tensor.set_lod(feed_data.lod()) # ensure no diff in multiple repeat times - repeat_time = 10 + repeat_time = 2 for i in range(repeat_time): predictor.zero_copy_run() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py index 949286f63efb3357325f25b02f60e938eebd28e8..0b8df63d666b6547d5dccfc2ce0b420d653cc544 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py @@ -19,7 +19,7 @@ import paddle import paddle.fluid as fluid from paddle.static import InputSpec from paddle.fluid.dygraph import to_variable, declarative, ProgramTranslator, Layer, jit -from paddle.fluid.dygraph.dygraph_to_static.program_translator import ConcreteProgram +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ConcreteProgram, StaticLayer from test_basic_api_transformation import dyfunc_to_variable @@ -84,6 +84,23 @@ class SimpleNet(Layer): return z +class TestStaticLayerInstance(unittest.TestCase): + def test_instance_same_class(self): + with fluid.dygraph.guard(fluid.CPUPlace()): + net_1 = SimpleNet() + net_2 = SimpleNet() + + self.assertTrue(isinstance(net_1.forward, StaticLayer)) + self.assertTrue(isinstance(net_2.forward, StaticLayer)) + self.assertNotEqual(net_1.forward, net_2.forward) + + # convert the layer of net_1 into a static program + net_1.forward.concrete_program + self.assertTrue(len(net_1.forward.program_cache) == 1) + # check that no conversion is applied to net_2 + self.assertTrue(len(net_2.forward.program_cache) == 0) + + class TestInputSpec(unittest.TestCase): def setUp(self): pass @@ -224,7 +241,6 @@ class TestDifferentInputSpecCacheProgram(unittest.TestCase): # 1.
specific InputSpec for `x`/`y` concrete_program_1 = foo.get_concrete_program( InputSpec([None, 10]), InputSpec([10])) - print(concrete_program_1) self.assertTrue(len(foo.program_cache) == 1) # 2. specific `c`/`d` explicitly with same default value diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py index 6cf59c030c00384b225d5d13160f68a3558084b9..cf7708c675aa9c1fb8faf5f8585b458be88b6c83 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py @@ -133,7 +133,7 @@ class TestPartialProgramRaiseError(unittest.TestCase): x = fluid.dygraph.to_variable(x_data) out = net(x) - program_cache = SimpleFcLayer.forward.program_cache + program_cache = net.forward.program_cache _, (concrete_program, _) = program_cache.last() params = concrete_program.parameters diff --git a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py new file mode 100644 index 0000000000000000000000000000000000000000..8f5715a0d0afcf59ebbe1cc95a6b06dead64c6e2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import unicode_literals +from __future__ import print_function + +import numpy as np +import paddle.fluid as fluid +import os +from paddle.fluid.layer_helper import LayerHelper + + +def check(): + print("check: fluid.core.globals()['FLAGS_use_mkldnn']=", + fluid.core.globals()["FLAGS_use_mkldnn"]) + print("check: fluid.get_flags('FLAGS_use_mkldnn')=", + fluid.get_flags(['FLAGS_use_mkldnn'])) + print("check: DNNL_VERBOSE=", os.environ['DNNL_VERBOSE']) + a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32) + helper = LayerHelper(fluid.unique_name.generate(str("test")), act="relu") + func = helper.append_activation + with fluid.dygraph.guard(fluid.core.CPUPlace()): + a = fluid.dygraph.to_variable(a_np) + res1 = func(a) + res2 = np.maximum(a_np, 0) + assert (np.array_equal(res1.numpy(), res2)) + + +if __name__ == '__main__': + try: + check() + for k, v in sorted(os.environ.items()): + print(k + ':', v) + print('\n') + except Exception as e: + print(e) + print(type(e)) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py b/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py new file mode 100644 index 0000000000000000000000000000000000000000..69676d0d70bdd523652c30c4cf066dc6982c46d4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py @@ -0,0 +1,58 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import unicode_literals +from __future__ import print_function + +import unittest +import os +import sys +import subprocess + + +class TestFlagsUseMkldnn(unittest.TestCase): + def setUp(self): + self._python_interp = sys.executable + self._python_interp += " check_flags_use_mkldnn.py" + + self.env = os.environ.copy() + self.env[str("GLOG_v")] = str("3") + self.env[str("DNNL_VERBOSE")] = str("1") + self.env[str("FLAGS_use_mkldnn")] = str("1") + + def test_flags_use_mkl_dnn(self): + cmd = self._python_interp + + proc = subprocess.Popen( + cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=self.env) + + out, err = proc.communicate() + returncode = proc.returncode + + print('out', out) + print('err', err) + + assert returncode == 0 + # in python3, type(out) is 'bytes'; encode the expected str before matching + assert out.find( + "dnnl_verbose,exec,cpu,eltwise,jit:avx512_common,forward_training," + "data_f32::blocked:abc:f0 diff_undef::undef::f0,,alg:eltwise_relu". + encode()) != -1 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py index aff13f0b555299d1c7b453b61be79f5a356a5416..b083e76897cd96cea93d7b90898541de1226ac15 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py @@ -114,8 +114,8 @@ class TestMnist(TestParallelDyGraphRunnerBase): model = MNIST() train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=2, drop_last=True) - opt = fluid.optimizer.Adam( - learning_rate=1e-3, parameter_list=model.parameters()) + opt = paddle.optimizer.Adam( + learning_rate=1e-3, parameters=model.parameters()) return model, train_reader, opt def run_one_loop(self, model, opt, data): diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py index 5a135cea52903a0d896df2d446b58d99e5a18993..424406c15bb18bade54a9b11bfdd96862d4df85c 100644 --- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py @@ -94,6 +94,10 @@ class TestPool1d_API(unittest.TestCase): result = ada_max_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + result = paddle.nn.functional.common.interpolate( + input, mode="area", size=16) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_adaptive_avg_static_results(self, place): with fluid.program_guard(fluid.Program(), fluid.Program()): input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py index 55c30e3d2ade0725e6debcdd0a69ca4eee622aec..e3c70884ebcf116feb4f5b0aa808c71e4b7f8c4e 100644 ---
a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py @@ -163,6 +163,9 @@ class TestAdaptiveAvgPool2dAPI(unittest.TestCase): out_5 = paddle.nn.functional.adaptive_avg_pool2d( x=x, output_size=[None, 3]) + out_6 = paddle.nn.functional.interpolate( + x=x, mode="area", size=[2, 5]) + assert np.allclose(out_1.numpy(), self.res_1_np) assert np.allclose(out_2.numpy(), self.res_2_np) @@ -173,6 +176,8 @@ class TestAdaptiveAvgPool2dAPI(unittest.TestCase): assert np.allclose(out_5.numpy(), self.res_5_np) + assert np.allclose(out_6.numpy(), self.res_3_np) + class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py index c04ee660667edaff01d7029e83b912c05429a15f..a3c9dd91a69ea83b08c3f817403620460333b5e9 100755 --- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py @@ -178,6 +178,9 @@ class TestAdaptiveAvgPool3dAPI(unittest.TestCase): out_5 = paddle.nn.functional.adaptive_avg_pool3d( x=x, output_size=[None, 3, None]) + out_6 = paddle.nn.functional.interpolate( + x=x, mode="area", size=[2, 3, 5]) + assert np.allclose(out_1.numpy(), self.res_1_np) assert np.allclose(out_2.numpy(), self.res_2_np) @@ -188,6 +191,8 @@ class TestAdaptiveAvgPool3dAPI(unittest.TestCase): assert np.allclose(out_5.numpy(), self.res_5_np) + assert np.allclose(out_6.numpy(), self.res_3_np) + class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py index 55612d71a17a7ae9801535bf5a35c83b100aab30..d3e990ca13eb2911ea04ed546b91f58e2db4e440 100644 --- a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py +++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py @@ -49,7 +49,6 @@ class TestAffineGridOp(OpTest): self.initTestCase() self.op_type = "affine_grid" theta = np.random.randint(1, 3, self.theta_shape).astype("float32") - theta = np.ones(self.theta_shape).astype("float32") self.inputs = {'Theta': theta} self.attrs = { "use_cudnn": self.use_cudnn, diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py index 0fd9863948aedb64052e8fa0668f03600ae3197c..74f76030a29d2c9ce27278b61548c8877c1467ad 100644 --- a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py @@ -322,6 +322,20 @@ class TestArgMinMaxOpError(unittest.TestCase): self.assertRaises(TypeError, test_argmin_axis_type) + def test_argmax_dtype_type(): + data = paddle.static.data( + name="test_argmax", shape=[10], dtype="float32") + output = paddle.argmax(x=data, dtype=None) + + self.assertRaises(ValueError, test_argmax_dtype_type) + + def test_argmin_dtype_type(): + data = paddle.static.data( + name="test_argmin", shape=[10], dtype="float32") + output = paddle.argmin(x=data, dtype=None) + + self.assertRaises(ValueError, test_argmin_dtype_type) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py index 01daea32167d28edbb46d6854872976aed79494e..b1ec74411987a73cf2e6a7d60aecce6c87ed598e 100644 --- 
a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py @@ -53,6 +53,8 @@ def value_bound(input, w, h, x, y): def bicubic_interp_np(input, out_h, out_w, + scale_h=0, + scale_w=0, out_size=None, actual_shape=None, align_corners=True, @@ -73,13 +75,19 @@ def bicubic_interp_np(input, if (align_corners): ratio_h = (in_h - 1.0) / (out_h - 1.0) else: - ratio_h = 1.0 * in_h / out_h + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h if out_w > 1: if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((batch_size, channel, out_h, out_w)) @@ -128,7 +136,8 @@ class TestBicubicInterpOp(OpTest): self.init_test_case() self.op_type = "bicubic_interp_v2" input_np = np.random.random(self.input_shape).astype("float64") - + scale_h = 0 + scale_w = 0 if self.data_layout == "NCHW": in_h = self.input_shape[2] in_w = self.input_shape[3] @@ -151,9 +160,9 @@ class TestBicubicInterpOp(OpTest): out_h = self.out_h out_w = self.out_w - output_np = bicubic_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners, - self.data_layout) + output_np = bicubic_interp_np(input_np, out_h, out_w, scale_h, scale_w, + self.out_size, self.actual_shape, + self.align_corners, self.data_layout) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -480,10 +489,34 @@ class TestBicubicOpError(unittest.TestCase): out = interpolate( x, size=None, - mode='trilinear', + mode='bicubic', align_corners=False, scale_factor=[1, 2, 2]) + def test_size_and_scale(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, + size=None, + mode='bicubic', + align_corners=False, + scale_factor=None) + + def test_size_and_scale2(): + x = fluid.data( + name="input", shape=[2, 3, 6, 9, 4], dtype="float32") + out = interpolate( + x, + size=[2, 2, 2], + mode='trilinear', + align_corners=False, + scale_factor=2.0) + + def test_size_type(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, size={2, 2}, mode='bicubic', align_corners=False) + self.assertRaises(ValueError, test_mode_type) self.assertRaises(ValueError, test_input_shape) self.assertRaises(TypeError, test_align_corcers) @@ -498,6 +531,9 @@ class TestBicubicOpError(unittest.TestCase): self.assertRaises(ValueError, test_align_corners_and_nearest) self.assertRaises(ValueError, test_scale_shape) self.assertRaises(ValueError, test_scale_value) + self.assertRaises(ValueError, test_size_and_scale) + self.assertRaises(ValueError, test_size_and_scale2) + self.assertRaises(TypeError, test_size_type) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py index d139a53c7e2ccc68964457f3142b4ed890d339f2..9fc4971fec23923a40080613612d3a1843a86d2e 100755 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py @@ -26,6 +26,8 @@ import paddle def bilinear_interp_np(input, out_h, out_w, + scale_w=0, + scale_h=0, out_size=None, actual_shape=None, align_corners=True, @@ -47,12 +49,18 @@ def bilinear_interp_np(input, if (align_corners): ratio_h = (in_h - 1.0) / (out_h - 1.0) else: - ratio_h = 1.0 * in_h / out_h + if 
scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h if out_w > 1: if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((batch_size, channel, out_h, out_w)) @@ -110,7 +118,8 @@ class TestBilinearInterpOp(OpTest): else: in_h = self.input_shape[1] in_w = self.input_shape[2] - + scale_h = 0 + scale_w = 0 if self.scale: if isinstance(self.scale, float) or isinstance(self.scale, int): if self.scale > 0.: @@ -126,9 +135,9 @@ class TestBilinearInterpOp(OpTest): out_h = self.out_h out_w = self.out_w - output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners, - self.align_mode, self.data_layout) + output_np = bilinear_interp_np( + input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape, + self.align_corners, self.align_mode, self.data_layout) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -238,6 +247,17 @@ class TestBilinearInterpCase6(TestBilinearInterpOp): self.align_mode = 1 +class TestBilinearInterpCase7(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = [2.0, 0.5] + self.align_corners = False + self.align_mode = 1 + + class TestBilinearInterpSame(TestBilinearInterpOp): def init_test_case(self): self.interp_method = 'bilinear' @@ -298,9 +318,9 @@ class TestBilinearInterpOpUint8(OpTest): out_h = self.out_h out_w = self.out_w - output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners, - self.align_mode) + output_np = bilinear_interp_np(input_np, out_h, out_w, 0, 0, + self.out_size, self.actual_shape, + self.align_corners, self.align_mode) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -481,8 +501,9 @@ class TestBilinearInterpOp_attr_tensor(OpTest): if isinstance(self.scale, list) and len(self.scale) == 1: self.scale = [self.scale[0], self.scale[0]] self.attrs['scale'] = self.scale - output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners) + output_np = bilinear_interp_np(input_np, out_h, out_w, 0, 0, + self.out_size, self.actual_shape, + self.align_corners) self.outputs = {'Out': output_np} def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py index da527b26bf0608da5a648d92b492ff27cf2802f0..35fce9e9d6ba9d7a2f264bdd5c1f3deb7a2a67e9 100644 --- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py @@ -44,7 +44,7 @@ class Conv1dTestCase(unittest.TestCase): self.spartial_shape = spartial_shape self.filter_size = filter_size self.data_format = data_format - self.channel_last = (self.data_format == "NHWC") + self.channel_last = (self.data_format == "NLC") self.padding = padding self.padding_mode = padding_mode @@ -147,6 +147,14 @@ class Conv1dErrorTestCase(Conv1dTestCase): self.paddle_nn_layer() +class Conv1dTypeErrorTestCase(Conv1dTestCase): + def runTest(self): + place = fluid.CPUPlace() + with dg.guard(place): + with self.assertRaises(TypeError): + self.paddle_nn_layer() + + def add_cases(suite): suite.addTest(Conv1dTestCase(methodName='runTest')) 
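The Conv1d cases added around here exercise symmetric and asymmetric padding ([1], [1, 2], plain ints) in both NCL and NLC layouts. For reference, the output-length arithmetic those cases rely on is the standard convolution formula; a small sketch (plain Python, not code from the patch):

```python
def conv1d_out_len(L, k, stride=1, pad_begin=0, pad_end=0, dilation=1):
    # L_out = floor((L + pad_begin + pad_end - dilation*(k-1) - 1) / stride) + 1
    return (L + pad_begin + pad_end - dilation * (k - 1) - 1) // stride + 1

print(conv1d_out_len(16, 3, pad_begin=1, pad_end=2))            # 17
print(conv1d_out_len(16, 3, stride=2, pad_begin=1, pad_end=1))  # 8
```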
suite.addTest(Conv1dTestCase(methodName='runTest', stride=[1], dilation=2)) @@ -161,6 +169,7 @@ def add_cases(suite): Conv1dTestCase( methodName='runTest', padding=2, data_format='NLC')) suite.addTest(Conv1dTestCase(methodName='runTest', padding=[1])) + suite.addTest(Conv1dTestCase(methodName='runTest', padding=[1, 2])) suite.addTest(Conv1dTestCase(methodName='runTest', padding=2)) suite.addTest(Conv1dTestCase(methodName='runTest')) suite.addTest( @@ -178,7 +187,7 @@ def add_cases(suite): def add_error_cases(suite): suite.addTest( - Conv1dErrorTestCase( + Conv1dTypeErrorTestCase( methodName='runTest', padding_mode="reflect", padding="valid")) suite.addTest( Conv1dErrorTestCase( diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py index 73227dd3610376d85fcfc70bb2653dfd927427fd..4c98aacd209dab8e5dc9e7744922a927700c4bb3 100644 --- a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py @@ -201,6 +201,7 @@ def add_cases(suite): ConvTranspose1dTestCase( methodName='runTest', data_format="NLC", stride=3, output_padding=2)) + suite.addTest(ConvTranspose1dTestCase(methodName='runTest', padding=[1, 2])) def add_error_cases(suite): diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py new file mode 100644 index 0000000000000000000000000000000000000000..0c2520038a82a0b9427b2cbe1d4010a1bc8e040c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -0,0 +1,163 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test cuda random seed.""" + +from __future__ import print_function +import os +import unittest +import paddle.fluid.generator as generator + +import time # temp for debug +import paddle.fluid as fluid +import numpy as np +import paddle +import paddle.fluid.core as core + + +class TestGeneratorSeed(unittest.TestCase): + """ + Test cases for cuda generator seed.
+ """ + + def test_gen_dropout_dygraph(self): + gen = paddle.manual_seed(12343) + + fluid.enable_dygraph() + + gen.manual_seed(111111111) + st = paddle.get_cuda_rng_state() + + x = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x_again = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x_third = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + print("x: {}".format(x.numpy())) + print("x_again: {}".format(x_again.numpy())) + x = x + x_again + x_third + y = fluid.layers.dropout(x, 0.5) + + paddle.set_cuda_rng_state(st) + + x1 = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x1_again = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x1_third = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x1 = x1 + x1_again + x1_third + y1 = fluid.layers.dropout(x1, 0.5) + y_np = y.numpy() + y1_np = y1.numpy() + + if core.is_compiled_with_cuda(): + print(">>>>>>> dropout dygraph >>>>>>>") + self.assertTrue(np.allclose(y_np, y1_np)) + + def test_generator_gaussian_random_dygraph(self): + """Test Generator seed.""" + fluid.enable_dygraph() + + paddle.manual_seed(12312321111) + x = fluid.layers.gaussian_random([120], dtype="float32") + st1 = paddle.get_cuda_rng_state() + x1 = fluid.layers.gaussian_random([120], dtype="float32") + paddle.set_cuda_rng_state(st1) + x2 = fluid.layers.gaussian_random([120], dtype="float32") + paddle.manual_seed(12312321111) + x3 = fluid.layers.gaussian_random([120], dtype="float32") + x_np = x.numpy() + x1_np = x1.numpy() + x2_np = x2.numpy() + x3_np = x3.numpy() + + if core.is_compiled_with_cuda(): + print(">>>>>>> gaussian random dygraph >>>>>>>") + self.assertTrue(np.allclose(x1_np, x2_np)) + self.assertTrue(np.allclose(x_np, x3_np)) + + def test_generator_randint_dygraph(self): + """Test Generator seed.""" + + fluid.enable_dygraph() + + gen = paddle.manual_seed(12312321111) + x = paddle.randint(low=10, shape=[10], dtype="int32") + st1 = gen.get_state() + x1 = paddle.randint(low=10, shape=[10], dtype="int32") + gen.set_state(st1) + x2 = paddle.randint(low=10, shape=[10], dtype="int32") + paddle.manual_seed(12312321111) + x3 = paddle.randint(low=10, shape=[10], dtype="int32") + x_np = x.numpy() + x1_np = x1.numpy() + x2_np = x2.numpy() + x3_np = x3.numpy() + + if core.is_compiled_with_cuda(): + print(">>>>>>> randint dygraph >>>>>>>") + self.assertTrue(np.allclose(x1_np, x2_np)) + self.assertTrue(np.allclose(x_np, x3_np)) + + def test_gen_TruncatedNormal_initializer(self): + fluid.disable_dygraph() + + gen = paddle.manual_seed(123123143) + cur_state = paddle.get_cuda_rng_state() + + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # example 1: + # attr shape is a list which doesn't contain tensor Variable. 
+ x = fluid.layers.uniform_random(shape=[2, 10]) + result_1 = fluid.layers.fc( + input=x, + size=10, + param_attr=fluid.initializer.TruncatedNormal( + loc=0.0, scale=2.0)) + result_2 = fluid.layers.fc( + input=x, + size=10, + param_attr=fluid.initializer.TruncatedNormal( + loc=0.0, scale=2.0)) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + out1 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + paddle.manual_seed(123123143) + with fluid.program_guard(train_program, startup_program): + exe.run(startup_program) + out2 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + out1_res1 = np.array(out1[0]) + out1_res2 = np.array(out1[1]) + out2_res1 = np.array(out2[0]) + out2_res2 = np.array(out2[1]) + + if core.is_compiled_with_cuda(): + print(">>>>>>> truncated normal static >>>>>>>") + self.assertTrue(np.allclose(out1_res1, out2_res1)) + self.assertTrue(np.allclose(out1_res2, out2_res2)) + self.assertTrue(not np.allclose(out1_res2, out1_res1)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 2919ec5e9ca97b1d59af46a54b2d702cb6de4a14..529fff158c55fc30248b9f5a88c8c615a8b55c79 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -43,7 +43,7 @@ class TestDirectory(unittest.TestCase): 'paddle.distributed.prepare_context', 'paddle.DataParallel', 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', - 'paddle.jit.save', 'paddle.jit.load', 'paddle.jit.SaveLoadConfig', + 'paddle.jit.save', 'paddle.jit.load', 'paddle.SaveLoadConfig', 'paddle.NoamDecay', 'paddle.PiecewiseDecay', 'paddle.NaturalExpDecay', 'paddle.ExponentialDecay', 'paddle.InverseTimeDecay', 'paddle.PolynomialDecay', diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index faff81fa84fb5fa66c9ff14f782d2301e3964672..f4d368b6b6f52f3071320eaffbeedc8d14d63d2e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -488,6 +488,50 @@ class TestParallelDyGraphRunnerBase(object): model.clear_gradients() return out_losses + def run_gpu_fleet_api_trainer(self, args): + import paddle.distributed.fleet as fleet + import paddle.distributed.fleet.base.role_maker as role_maker + # 1. enable dygraph + paddle.disable_static() + + # 2. init seed + seed = 90 + paddle.static.default_startup_program().random_seed = seed + paddle.static.default_main_program().random_seed = seed + np.random.seed(seed) + random.seed(seed) + # get trainer id + args.trainer_id = paddle.distributed.get_rank() + + # 3. init parallel env + if args.update_method == "nccl2": + fleet.init(is_collective=True) + + # 4.
train model + model, train_reader, opt = self.get_model() + if args.update_method == "nccl2": + opt = fleet.distributed_optimizer(opt) + model = fleet.distributed_model(model) + + out_losses = [] + for step_id, data in enumerate(train_reader()): + data = self._get_data(data, args) + if step_id == RUN_STEP: + break + loss = self.run_one_loop(model, opt, data) + out_losses.append(loss.numpy()) + + if args.update_method == "nccl2": + loss = model.scale_loss(loss) + + loss.backward() + if args.update_method == "nccl2": + model.apply_collective_grads() + + opt.step() + opt.clear_grad() + print_to_out(out_losses) + def runtime_main(test_class): parser = argparse.ArgumentParser(description='Run dist test.') @@ -687,7 +731,8 @@ class TestDistBase(unittest.TestCase): envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '') cmd += " -m coverage run --branch -p" - cmd += " %s --role trainer --lr %f" % (model, self._lr) + cmd += " %s --role trainer --update_method local --lr %f" % (model, + self._lr) if batch_size != DEFAULT_BATCH_SIZE: cmd += " --batch_size %d" % batch_size @@ -850,6 +895,7 @@ class TestDistBase(unittest.TestCase): if self.__use_cuda: tr_cmd += " --use_cuda" env.update({ + "FLAGS_selected_gpus": "{}".format(0), "CUDA_VISIBLE_DEVICES": "{}".format(trainer_id % 2), "PADDLE_TRAINERS_NUM": "{}".format(trainer_num), "PADDLE_TRAINER_ID": "{}".format(trainer_id), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py index c3ffd50dc8da16f4a19c8da5383fe7f763aa7a72..02a739c060cd2bd58ecec4d7dc65b65e8a3a35a7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py @@ -36,13 +36,45 @@ class TestDistHeterDatasetAsync2x2(TestFleetHeterBase): "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_rpc_deadline": "5000", # 5sec to fail fast "http_proxy": "", - "CPU_NUM": "1" + "CPU_NUM": "3" } required_envs.update(need_envs) if check_error_log: - required_envs["GLOG_v"] = "4" + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) + + def test_dist_train(self): + self.check_with_place( + "dist_fleet_heter_ctr.py", delta=1e-5, check_error_log=True) + + +class TestDistHeterPyreaderAsync2x2(TestFleetHeterBase): + def _setup_config(self): + self._mode = "async" + self._reader = "pyreader" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast + "http_proxy": "", + "CPU_NUM": "3" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" required_envs["GLOG_logtostderr"] = "1" tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py index 533ad9604cf0d879371796fb197e61e931fb479f..47a1c407230527d53327ba57d7b5d7a979bd7d49 100644 --- a/python/paddle/fluid/tests/unittests/test_distribution.py +++ b/python/paddle/fluid/tests/unittests/test_distribution.py @@ -40,8 +40,11 @@ class DistributionNumpy(): class UniformNumpy(DistributionNumpy): def __init__(self, low, high): - self.low = np.array(low).astype('float32') - 
self.high = np.array(high).astype('float32') + self.low = np.array(low) + self.high = np.array(high) + if str(self.low.dtype) not in ['float32', 'float64']: + self.low = self.low.astype('float32') + self.high = self.high.astype('float32') def sample(self, shape): shape = tuple(shape) + (self.low + self.high).shape @@ -49,13 +52,13 @@ class UniformNumpy(DistributionNumpy): (self.high - self.low)) def log_prob(self, value): - lb = np.less(self.low, value).astype('float32') - ub = np.less(value, self.high).astype('float32') + lb = np.less(self.low, value).astype(self.low.dtype) + ub = np.less(value, self.high).astype(self.low.dtype) return np.log(lb * ub) - np.log(self.high - self.low) def probs(self, value): - lb = np.less(self.low, value).astype('float32') - ub = np.less(value, self.high).astype('float32') + lb = np.less(self.low, value).astype(self.low.dtype) + ub = np.less(value, self.high).astype(self.low.dtype) return (lb * ub) / (self.high - self.low) def entropy(self): @@ -64,8 +67,11 @@ class UniformNumpy(DistributionNumpy): class NormalNumpy(DistributionNumpy): def __init__(self, loc, scale): - self.loc = np.array(loc).astype('float32') - self.scale = np.array(scale).astype('float32') + self.loc = np.array(loc) + self.scale = np.array(scale) + if str(self.loc.dtype) not in ['float32', 'float64']: + self.loc = self.loc.astype('float32') + self.scale = self.scale.astype('float32') def sample(self, shape): shape = tuple(shape) + (self.loc + self.scale).shape @@ -83,8 +89,8 @@ class NormalNumpy(DistributionNumpy): (2. * var)) / (math.sqrt(2 * math.pi) * self.scale) def entropy(self): - return 0.5 + 0.5 * np.log(np.array(2. * math.pi).astype( - 'float32')) + np.log(self.scale) + return 0.5 + 0.5 * np.log( + np.array(2. * math.pi).astype(self.loc.dtype)) + np.log(self.scale) def kl_divergence(self, other): var_ratio = (self.scale / other.scale) @@ -94,724 +100,571 @@ class NormalNumpy(DistributionNumpy): return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio)) -class DistributionTest(unittest.TestCase): - def setUp(self, use_gpu=False): +class UniformTest(unittest.TestCase): + def setUp(self, use_gpu=False, batch_size=5, dims=6): self.use_gpu = use_gpu if not use_gpu: - place = fluid.CPUPlace() + self.place = fluid.CPUPlace() self.gpu_id = -1 else: - place = fluid.CUDAPlace(0) + self.place = fluid.CUDAPlace(0) self.gpu_id = 0 - self.executor = fluid.Executor(place) - - def build_normal_common_net(self, batch_size, dims, sample_shape, loc_float, - scale_float, other_loc_float, other_scale_float, - scale_np, other_scale_np, loc_np, other_loc_np, - loc, scale, other_loc, other_scale, values): - """Generate Normal object and get the output of its methods including - ``sample``, ``entropy``, ``log_prob``, ``probs`` and ``kl_divergence``. - Parameters ``loc`` and ``scale`` have different data types to test different situations. - - Args: - batch_size(int): The first dimension of the shape of parameters(loc and scale). - dims(int): The second dimension of the shape of parameters. - sample_shape(int): The sample value used in ``sample`` method. - loc_float(float): Generated in function ``get_normal_random_input``, loc is a float number. - scale_float(float): Generated in function ``get_normal_random_input``, scale is a float number. - other_loc_float(float): Generated in function ``get_normal_random_input``, other_loc is a - float number. It is the first parameter in another Normal object used in ``kl_divergence`` - method. 
- other_scale_float(float): Generated in function ``get_normal_random_input``, other_scale is a - float number. It is the second parameter in another Normal object used in ``kl_divergence`` - method. - scale_np(numpy.ndarray): Generated in function ``get_normal_random_input``, An numpy array - whose shape is [batch_size, dims]. - other_scale_np(numpy.ndarray): Generated in function ``get_normal_random_input``, other_scale_np - is an numpy array. It is the second parameter in another Normal object used in ``kl_divergence`` - method. - loc_np(numpy.ndarray): Generated in function ``get_normal_random_input``, An numpy array - whose shape is [batch_size, dims]. - other_loc_np(numpy.ndarray): Generated in function ``get_normal_random_input``, other_loc_np - is an numpy array. It is the first parameter in another Normal object used in ``kl_divergence`` - method. - loc(Tensor): In dynamic mode, loc is generated in ``build_normal_dygraph``, it's a Tensor filled - with ``loc_np`` data. In static mode, loc is generated in ``build_normal_static``, ``layers.data`` - method is used to get a Placeholder whose shape is [dims]. - scale(Tensor): In dynamic mode, scale is generated in ``build_normal_dygraph``, it's a Tensor filled - with ``scale_np`` data. In static mode, scale is generated in ``build_normal_static``, ``layers.data`` - method is used to get a Placeholder whose shape is [dims]. - other_loc(Tensor): In dynamic mode, other_loc is generated in ``build_normal_dygraph``, it's a Tensor - filled with ``other_loc_np`` data. In static mode, other_loc is generated in ``build_normal_static``, - ``layers.data`` method is used to get a Placeholder whose shape is [dims]. It is the first parameter - in another Normal object used in ``kl_divergence`` method. - other_scale(Tensor): In dynamic mode, other_scale is generated in ``build_normal_dygraph``, it's a Tensor - filled with ``other_scale_np`` data. In static mode, other_scale is generated in ``build_normal_static``, - ``layers.data`` method is used to get a Placeholder whose shape is [dims]. It is the second parameter - in another Normal object used in ``kl_divergence`` method. - values(Tensor): In dynamic mode, values is generated in ``build_normal_dygraph``, it's a Tensor filled with - ``values_np`` data. In static mode, values is generated in ``build_normal_static``, ``layers.data`` - method is used to get a Placeholder whose shape is [dims]. - - Returns: - List: The elements of the list are the output of sample, entropy, log_prob, probs, kl_divergence methods. - The inputs' type of these methods can be float, np.ndarray and Tensor. And broadcast will be considered. 
- - """ - normal_int = Normal(int(loc_float), int(scale_float)) - normal_float = Normal(loc_float, scale_float) - other_normal_float = Normal(other_loc_float, other_scale_float) - - normal_float_np_broadcast = Normal(loc_float, scale_np) - other_normal_float_np_broadcast = Normal(other_loc_float, - other_scale_np) - - normal_np = Normal(loc_np, scale_np) - other_normal_np = Normal(other_loc_np, other_scale_np) - - normal_variable = Normal(loc, scale) - other_normal_variable = Normal(other_loc, other_scale) - - sample_int = normal_int.sample([batch_size, dims]) - sample_float = normal_float.sample([batch_size, dims]) - sample_float_np_broadcast = normal_float_np_broadcast.sample( - [batch_size, dims]) - sample_np = normal_np.sample([batch_size, dims]) - sample_variable = normal_variable.sample([batch_size, dims]) - - sample_int_diff = normal_int.sample([sample_shape]) - sample_float_diff = normal_float.sample([sample_shape]) - sample_float_np_broadcast_diff = normal_float_np_broadcast.sample( - [sample_shape]) - sample_np_diff = normal_np.sample([sample_shape]) - sample_variable_diff = normal_variable.sample([sample_shape]) - - entropy_int = normal_int.entropy() - entropy_float = normal_float.entropy() - entropy_float_np_broadcast = normal_float_np_broadcast.entropy() - entropy_np = normal_np.entropy() - entropy_variable = normal_variable.entropy() - - lp_float_np_broadcast = normal_float_np_broadcast.log_prob(values) - lp_np = normal_np.log_prob(values) - lp_variable = normal_variable.log_prob(values) - - p_float_np_broadcast = normal_float_np_broadcast.probs(values) - p_np = normal_np.probs(values) - p_variable = normal_variable.probs(values) - - kl_float = normal_float.kl_divergence(other_normal_float) - kl_float_np_broadcast = normal_float_np_broadcast.kl_divergence( - other_normal_float_np_broadcast) - kl_np = normal_np.kl_divergence(other_normal_np) - kl_variable = normal_variable.kl_divergence(other_normal_variable) - - fetch_list = [ - sample_int, sample_float, sample_float_np_broadcast, sample_np, - sample_variable, sample_int_diff, sample_float_diff, - sample_float_np_broadcast_diff, sample_np_diff, - sample_variable_diff, entropy_int, entropy_float, - entropy_float_np_broadcast, entropy_np, entropy_variable, - lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast, - p_np, p_variable, kl_float, kl_float_np_broadcast, kl_np, - kl_variable - ] - return fetch_list - - def build_normal_static(self, test_program, batch_size, dims, sample_shape, - loc_float, scale_float, other_loc_float, - other_scale_float, scale_np, other_scale_np, loc_np, - other_loc_np, values_np): - """ - In static mode, generate feed data of Normal network, and get output fetch_list using - ``build_normal_common_net``. - - Args: - test_program: In static mode, the Program object. - other args can refer to function ``build_normal_common_net``. - - Returns: - feed_vars: The feed data of Normal network in static mode. - fetch_list: The output is generated by function ``build_normal_common_net``. 
- """ - with fluid.program_guard(test_program): - loc = layers.data(name='loc', shape=[dims], dtype='float32') - scale = layers.data(name='scale', shape=[dims], dtype='float32') - - other_loc = layers.data( - name='other_loc', shape=[dims], dtype='float32') - other_scale = layers.data( - name='other_scale', shape=[dims], dtype='float32') - values = layers.data(name='values', shape=[dims], dtype='float32') + self.init_numpy_data(batch_size, dims) - fetch_list = self.build_normal_common_net( - batch_size, dims, sample_shape, loc_float, scale_float, - other_loc_float, other_scale_float, scale_np, other_scale_np, - loc_np, other_loc_np, loc, scale, other_loc, other_scale, - values) + paddle.disable_static(self.place) + self.init_dynamic_data(batch_size, dims) - feed_vars = { - 'loc': loc_np, - 'scale': scale_np, - 'other_loc': other_loc_np, - 'other_scale': other_scale_np, - 'values': values_np - } - return feed_vars, fetch_list - - def build_normal_dygraph(self, batch_size, dims, sample_shape, loc_float, - scale_float, other_loc_float, other_scale_float, - scale_np, other_scale_np, loc_np, other_loc_np, - values_np): - """ - In dynamic mode, generate input data of Normal network, and get output fetch_list using - ``build_normal_common_net``. - - Args: - refer to function ``build_normal_common_net``. - - Returns: - fetch_list_numpy: The output is generated by function ``build_normal_common_net``. Transform - these tensor to numpy.ndarray. - """ - loc = paddle.to_tensor(loc_np) - scale = paddle.to_tensor(scale_np) - other_loc = paddle.to_tensor(other_loc_np) - other_scale = paddle.to_tensor(other_scale_np) - values = paddle.to_tensor(values_np) - - fetch_list = self.build_normal_common_net( - batch_size, dims, sample_shape, loc_float, scale_float, - other_loc_float, other_scale_float, scale_np, other_scale_np, - loc_np, other_loc_np, loc, scale, other_loc, other_scale, values) - fetch_list_numpy = [t.numpy() for t in fetch_list] - return fetch_list_numpy - - def get_normal_random_input(self, batch_size, dims): - """ - Generate input data ``loc`` and ``scale`` used in Normal network. - - Args: - refer to function ``build_normal_common_net``. - - Returns: - List: Different data type of ``loc`` and ``scale``, including float, numpy.ndarray. - By the way, ``other_loc`` and ``other_scale`` are used in ``kl_divergence`` method. - refer to ``args`` in function ``build_normal_common_net``. 
- """ - loc_np = np.random.randn(batch_size, dims).astype('float32') - other_loc_np = np.random.randn(batch_size, dims).astype('float32') - - loc_float = (np.random.ranf() - 0.5) * 4 - scale_float = (np.random.ranf() - 0.5) * 4 - while scale_float < 0: - scale_float = (np.random.ranf() - 0.5) * 4 - - other_loc_float = (np.random.ranf() - 0.5) * 4 - other_scale_float = (np.random.ranf() - 0.5) * 4 - while other_scale_float < 0: - other_scale_float = (np.random.ranf() - 0.5) * 4 - - scale_np = np.random.randn(batch_size, dims).astype('float32') - other_scale_np = np.random.randn(batch_size, dims).astype('float32') - values_np = np.random.randn(batch_size, dims).astype('float32') - - while not np.all(scale_np > 0): - scale_np = np.random.randn(batch_size, dims).astype('float32') - while not np.all(other_scale_np > 0): - other_scale_np = np.random.randn(batch_size, dims).astype('float32') - return [ - loc_np, other_loc_np, loc_float, scale_float, other_loc_float, - other_scale_float, scale_np, other_scale_np, values_np - ] - - def compare_normal_with_numpy(self, - data_list, - output_list, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Compare the outputs of Normal's methods in paddle and numpy. If the outputs are not consistent, - raise errors. - - Args: - data_list: Input data generated by function ``get_normal_random_input``. - output_list: The outputs of Normal's methods in static or dynamic mode. - batch_size(int): The first dimension of the shape of parameters(loc and scale). - dims(int): The second dimension of the shape of parameters. - sample_shape(int): The sample value used in ``sample`` method. - tolerance(float): The tolerance of the error. - """ - loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list - - np_normal_int = NormalNumpy(int(loc_float), int(scale_float)) - np_normal_float = NormalNumpy(loc_float, scale_float) - np_other_normal_float = NormalNumpy(other_loc_float, other_scale_float) - np_normal_float_np_broadcast = NormalNumpy(loc_float, scale_np) - np_other_normal_float_np_broadcast = NormalNumpy(other_loc_float, - other_scale_np) - np_normal = NormalNumpy(loc_np, scale_np) - np_other_normal = NormalNumpy(other_loc_np, other_scale_np) - - gt_sample_int = np_normal_int.sample([batch_size, dims]) - gt_sample_float = np_normal_float.sample([batch_size, dims]) - gt_sample_float_np_broadcast = np_normal_float_np_broadcast.sample( - [batch_size, dims]) - gt_sample_np = np_normal.sample([batch_size, dims]) - - gt_sample_int_diff = np_normal_int.sample([sample_shape]) - gt_sample_float_diff = np_normal_float.sample([sample_shape]) - gt_sample_float_np_broadcast_diff = np_normal_float_np_broadcast.sample( - [sample_shape]) - gt_sample_np_diff = np_normal.sample([sample_shape]) - - gt_entropy_int = np_normal_int.entropy() - gt_entropy_float = np_normal_float.entropy() - gt_entropy_float_np_broadcast = np_normal_float_np_broadcast.entropy() - gt_entropy = np_normal.entropy() - gt_lp_float_np_broadcast = np_normal_float_np_broadcast.log_prob( - values_np) - gt_lp = np_normal.log_prob(values_np) - gt_p_float_np_broadcast = np_normal_float_np_broadcast.probs(values_np) - gt_p = np_normal.probs(values_np) - gt_kl_float = np_normal_float.kl_divergence(np_other_normal_float) - gt_kl_float_np_broadcast = np_normal_float_np_broadcast.kl_divergence( - np_other_normal_float_np_broadcast) - gt_kl = np_normal.kl_divergence(np_other_normal) - - [ - output_sample_int, output_sample_float, - 
output_sample_float_np_broadcast, output_sample_np, - output_sample_variable, output_sample_int_diff, - output_sample_float_diff, output_sample_float_np_broadcast_diff, - output_sample_np_diff, output_sample_variable_diff, - output_entropy_int, output_entropy_float, - output_entropy_float_np_broadcast, output_entropy_np, - output_entropy_variable, output_lp_float_np_broadcast, output_lp_np, - output_lp_variable, output_p_float_np_broadcast, output_p_np, - output_p_variable, output_kl_float, output_kl_float_np_broadcast, - output_kl_np, output_kl_variable - ] = output_list - - np.testing.assert_equal(output_sample_int.shape, gt_sample_int.shape) - np.testing.assert_equal(output_sample_float.shape, - gt_sample_float.shape) - np.testing.assert_equal(output_sample_float_np_broadcast.shape, - gt_sample_float_np_broadcast.shape) - np.testing.assert_equal(output_sample_np.shape, gt_sample_np.shape) - np.testing.assert_equal(output_sample_variable.shape, - gt_sample_np.shape) - np.testing.assert_equal(output_sample_int_diff.shape, - gt_sample_int_diff.shape) - np.testing.assert_equal(output_sample_float_diff.shape, - gt_sample_float_diff.shape) - np.testing.assert_equal(output_sample_float_np_broadcast_diff.shape, - gt_sample_float_np_broadcast_diff.shape) - np.testing.assert_equal(output_sample_np_diff.shape, - gt_sample_np_diff.shape) - np.testing.assert_equal(output_sample_variable_diff.shape, - gt_sample_np_diff.shape) - np.testing.assert_allclose( - output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_entropy_float, - gt_entropy_float, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_entropy_float_np_broadcast, - gt_entropy_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_lp_float_np_broadcast, - gt_lp_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_lp_np, gt_lp, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_p_float_np_broadcast, - gt_p_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_p_np, gt_p, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_p_variable, gt_p, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_kl_float, gt_kl_float, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_kl_float_np_broadcast, - gt_kl_float_np_broadcast, - rtol=tolerance, - atol=tolerance) + paddle.enable_static() + self.test_program = fluid.Program() + self.executor = fluid.Executor(self.place) + self.init_static_data(batch_size, dims) + + def init_numpy_data(self, batch_size, dims): + # low and high are 'float' + self.low_np = np.random.uniform(-2, 1) + self.high_np = np.random.uniform(1, 3) + self.values_np = np.array([1.0]).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = self.low_np + self.dynamic_high = self.high_np + self.dynamic_values = paddle.to_tensor(self.values_np) + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values',
shape=[], dtype='float32') + + def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6): + sample, entropy, log_prob, probs = fetch_list + + np_uniform = UniformNumpy(self.low_np, self.high_np) + np_sample = np_uniform.sample([sample_shape]) + np_entropy = np_uniform.entropy() + np_lp = np_uniform.log_prob(self.values_np) + np_p = np_uniform.probs(self.values_np) + + np.testing.assert_equal(sample.shape, np_sample.shape) np.testing.assert_allclose( - output_kl_np, gt_kl, rtol=tolerance, atol=tolerance) + entropy, np_entropy, rtol=tolerance, atol=tolerance) np.testing.assert_allclose( - output_kl_variable, gt_kl, rtol=tolerance, atol=tolerance) - - def test_normal_distribution_static(self, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Test Normal's methods in static mode. - - Args: - refer to ``compare_normal_with_numpy`` function. - """ - test_program = fluid.Program() - data_list = self.get_normal_random_input(batch_size, dims) - loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list - - feed_vars, fetch_list = self.build_normal_static( - test_program, batch_size, dims, sample_shape, loc_float, - scale_float, other_loc_float, other_scale_float, scale_np, - other_scale_np, loc_np, other_loc_np, values_np) - self.executor.run(fluid.default_startup_program()) + log_prob, np_lp, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose(probs, np_p, rtol=tolerance, atol=tolerance) - output_list = self.executor.run(program=test_program, - feed=feed_vars, - fetch_list=fetch_list) - - self.compare_normal_with_numpy(data_list, output_list, batch_size, dims, - sample_shape, tolerance) - - def test_normal_distribution_dygraph(self, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Test Normal's methods in dynamic mode. - - Args: - refer to ``compare_normal_with_numpy`` function. 
- """ - paddle.disable_static() - data_list = self.get_normal_random_input(batch_size, dims) - loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list - - output_list = self.build_normal_dygraph( - batch_size, dims, sample_shape, loc_float, scale_float, - other_loc_float, other_scale_float, scale_np, other_scale_np, - loc_np, other_loc_np, values_np) - - self.compare_normal_with_numpy(data_list, output_list, batch_size, dims, - sample_shape, tolerance) + def test_uniform_distribution_dygraph(self, sample_shape=7, tolerance=1e-6): + paddle.disable_static(self.place) + uniform = Uniform(self.dynamic_low, self.dynamic_high) + sample = uniform.sample([sample_shape]).numpy() + entropy = uniform.entropy().numpy() + log_prob = uniform.log_prob(self.dynamic_values).numpy() + probs = uniform.probs(self.dynamic_values).numpy() + fetch_list = [sample, entropy, log_prob, probs] + + self.compare_with_numpy(fetch_list) + + def test_uniform_distribution_static(self, sample_shape=7, tolerance=1e-6): paddle.enable_static() + with fluid.program_guard(self.test_program): + uniform = Uniform(self.static_low, self.static_high) + sample = uniform.sample([sample_shape]) + entropy = uniform.entropy() + log_prob = uniform.log_prob(self.static_values) + probs = uniform.probs(self.static_values) + fetch_list = [sample, entropy, log_prob, probs] - def build_uniform_common_net(self, batch_size, dims, sample_shape, - low_float, high_float, high_np, low_np, - values_np, low, high, values): - """Generate Uniform object and get the output of its methods including ``sample``, ``entropy``, - ``log_prob`` and ``probs``. - Parameters ``low`` and ``high`` have different data types to test different situations. - - Args: - batch_size(int): The first dimension of the shape of parameters(low and high). - dims(int): The second dimension of the shape of parameters. - sample_shape(int): The sample value used in ``sample`` method. - low_float(float): Parameter ``low`` is a float number. - high_float(float): Parameter ``high`` is a float number. - high_np(numpy.ndarray): An numpy array whose shape is [batch_size, dims]. - low_np(numpy.ndarray): An numpy array whose shape is [batch_size, dims]. - values_np(numpy.ndarray): The input of ``log_prob`` and ``probs`` methods. An numpy array whose - shape is [batch_size, dims]. - low(Tensor): In dynamic mode, low is generated in ``build_uniform_dygraph``, it's a Tensor filled - with ``low_np`` data. In static mode, low is generated in ``build_uniform_static``. - high(Tensor): In dynamic mode, high is generated in ``build_uniform_dygraph``, it's a Tensor filled - with ``high_np`` data. In static mode, high is generated in ``build_uniform_static``. - values(Tensor): In dynamic mode, values is generated in ``build_uniform_dygraph``, it's a Tensor - filled with ``values_np`` data. In static mode, values is generated in ``build_uniform_static``. - - Returns: - List: The elements of the list are the output of sample, entropy, log_prob, probs methods. - The inputs' type of these methods can be float, np.ndarray and Tensor. And broadcast will be - considered. 
- - """ - uniform_int = Uniform(int(low_float), int(high_float)) - uniform_float = Uniform(low_float, high_float) - uniform_float_np_broadcast = Uniform(low_float, high_np) - uniform_np = Uniform(low_np, high_np) - uniform_variable = Uniform(low, high) - - sample_int = uniform_int.sample([batch_size, dims]) - sample_float = uniform_float.sample([batch_size, dims]) - sample_float_np_broadcast = uniform_float_np_broadcast.sample( - [batch_size, dims]) - sample_np = uniform_np.sample([batch_size, dims]) - sample_variable = uniform_variable.sample([batch_size, dims]) - - sample_int_diff = uniform_int.sample([sample_shape]) - sample_float_diff = uniform_float.sample([sample_shape]) - sample_float_np_broadcast_diff = uniform_float_np_broadcast.sample( - [sample_shape]) - sample_np_diff = uniform_np.sample([sample_shape]) - sample_variable_diff = uniform_variable.sample([sample_shape]) - - entropy_int = uniform_int.entropy() - entropy_float = uniform_float.entropy() - entropy_float_np_broadcast = uniform_float_np_broadcast.entropy() - entropy_np = uniform_np.entropy() - entropy_variable = uniform_variable.entropy() - - lp_float_np_broadcast = uniform_float_np_broadcast.log_prob(values) - lp_np = uniform_np.log_prob(values) - lp_variable = uniform_variable.log_prob(values) - - p_float_np_broadcast = uniform_float_np_broadcast.probs(values) - p_np = uniform_np.probs(values) - p_variable = uniform_variable.probs(values) - - fetch_list = [ - sample_int, sample_float, sample_float_np_broadcast, sample_np, - sample_variable, sample_int_diff, sample_float_diff, - sample_float_np_broadcast_diff, sample_np_diff, - sample_variable_diff, entropy_int, entropy_float, - entropy_float_np_broadcast, entropy_np, entropy_variable, - lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast, - p_np, p_variable - ] - return fetch_list - - def build_uniform_static(self, test_program, batch_size, dims, sample_shape, - low_float, high_float, high_np, low_np, values_np): - """ - In static mode, generate feed data of Uniform network, and get output fetch_list using - ``build_uniform_common_net``. - - Args: - test_program: In static mode, the Program object. - other args can refer to function ``build_uniform_common_net``. - - Returns: - feed_vars: The feed data of Uniform network in static mode. - fetch_list: The output is generated by function ``build_uniform_common_net``. - """ - with fluid.program_guard(test_program): - low = layers.data(name='low', shape=[dims], dtype='float32') - high = layers.data(name='high', shape=[dims], dtype='float32') - - values = layers.data(name='values', shape=[dims], dtype='float32') - - fetch_list = self.build_uniform_common_net( - batch_size, dims, sample_shape, low_float, high_float, high_np, - low_np, values_np, low, high, values) - - feed_vars = {'low': low_np, 'high': high_np, 'values': values_np} - return feed_vars, fetch_list - - def build_uniform_dygraph(self, batch_size, dims, sample_shape, low_float, - high_float, high_np, low_np, values_np): - """ - In dynamic mode, generate input data of Uniform network, and get output fetch_list using - ``build_uniform_common_net``. - - Args: - refer to function ``build_uniform_common_net``. - - Returns: - fetch_list_numpy: The output is generated by function ``build_uniform_common_net``. Transform - these tensor to numpy.ndarray. 
- """ - low = paddle.to_tensor(low_np) - high = paddle.to_tensor(high_np) - values = paddle.to_tensor(values_np) - - fetch_list = self.build_uniform_common_net( - batch_size, dims, sample_shape, low_float, high_float, high_np, - low_np, values_np, low, high, values) - fetch_list_numpy = [t.numpy() for t in fetch_list] - return fetch_list_numpy - - def compare_uniform_with_numpy(self, - data_list, - output_list, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Compare the outputs of Uniform's methods in paddle and numpy. If the outputs are not consistent, - raise errors. - - Args: - data_list: Input data including float and numpy.ndarray type of ``low`` and ``high`` parameters. - output_list: The outputs of Uniform's methods in static or dynamic mode. - batch_size(int): The first dimension of the shape of parameters(low and high). - dims(int): The second dimension of the shape of parameters. - sample_shape(int): The sample value used in ``sample`` method. - tolerance(float): The tolerance of the error. - """ - [low_np, low_float, high_float, high_np, values_np] = data_list - - np_uniform_int = UniformNumpy(int(low_float), int(high_float)) - np_uniform_float = UniformNumpy(low_float, high_float) - np_uniform_float_np_broadcast = UniformNumpy(low_float, high_np) - np_uniform = UniformNumpy(low_np, high_np) - - gt_sample_int = np_uniform_int.sample([batch_size, dims]) - gt_sample_float = np_uniform_float.sample([batch_size, dims]) - gt_sample_float_np_broadcast = np_uniform_float_np_broadcast.sample( - [batch_size, dims]) - gt_sample_np = np_uniform.sample([batch_size, dims]) - gt_sample_int_diff = np_uniform_int.sample([sample_shape]) - gt_sample_float_diff = np_uniform_float.sample([sample_shape]) - gt_sample_float_np_broadcast_diff = np_uniform_float_np_broadcast.sample( - [sample_shape]) - gt_sample_np_diff = np_uniform.sample([sample_shape]) - gt_entropy_int = np_uniform_int.entropy() - gt_entropy_float = np_uniform_float.entropy() - gt_entropy_float_np_broadcast = np_uniform_float_np_broadcast.entropy() - gt_entropy = np_uniform.entropy() - gt_lp_float_np_broadcast = np_uniform_float_np_broadcast.log_prob( - values_np) - gt_lp = np_uniform.log_prob(values_np) - gt_p_float_np_broadcast = np_uniform_float_np_broadcast.probs(values_np) - gt_p = np_uniform.probs(values_np) - - [ - output_sample_int, output_sample_float, - output_sample_float_np_broadcast, output_sample_np, - output_sample_variable, output_sample_int_diff, - output_sample_float_diff, output_sample_float_np_broadcast_diff, - output_sample_np_diff, output_sample_variable_diff, - output_entropy_int, output_entropy_float, - output_entropy_float_np_broadcast, output_entropy_np, - output_entropy_variable, output_lp_float_np_broadcast, output_lp_np, - output_lp_variable, output_p_float_np_broadcast, output_p_np, - output_p_variable - ] = output_list - - np.testing.assert_equal(output_sample_int.shape, gt_sample_int.shape) - np.testing.assert_equal(output_sample_float.shape, - gt_sample_float.shape) - np.testing.assert_equal(output_sample_float_np_broadcast.shape, - gt_sample_float_np_broadcast.shape) - np.testing.assert_equal(output_sample_np.shape, gt_sample_np.shape) - np.testing.assert_equal(output_sample_variable.shape, - gt_sample_np.shape) - np.testing.assert_equal(output_sample_int_diff.shape, - gt_sample_int_diff.shape) - np.testing.assert_equal(output_sample_float_diff.shape, - gt_sample_float_diff.shape) - np.testing.assert_equal(output_sample_float_np_broadcast_diff.shape, - 
gt_sample_float_np_broadcast_diff.shape) - np.testing.assert_equal(output_sample_np_diff.shape, - gt_sample_np_diff.shape) - np.testing.assert_equal(output_sample_variable_diff.shape, - gt_sample_np_diff.shape) - np.testing.assert_allclose( - output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_entropy_float, - gt_entropy_float, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_entropy_float_np_broadcast, - gt_entropy_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_lp_float_np_broadcast, - gt_lp_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_lp_np, gt_lp, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_p_float_np_broadcast, - gt_p_float_np_broadcast, - rtol=tolerance, - atol=tolerance) + feed_vars = { + 'low': self.low_np, + 'high': self.high_np, + 'values': self.values_np + } + + self.executor.run(fluid.default_startup_program()) + fetch_list = self.executor.run(program=self.test_program, + feed=feed_vars, + fetch_list=fetch_list) + + self.compare_with_numpy(fetch_list) + + +class UniformTest2(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are 'int' + self.low_np = int(np.random.uniform(-2, 1)) + self.high_np = int(np.random.uniform(1, 3)) + self.values_np = np.array([1.0]).astype('float32') + + +class UniformTest3(UniformTest): + def init_numpy_data(self, batch_size, dims): + # test broadcast: low is float, high is numpy.ndarray with dtype 'float32'. + self.low_np = np.random.uniform(-2, 1) + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest4(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are numpy.ndarray with dtype 'float32'. + self.low_np = np.random.randn(batch_size, dims).astype('float32') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest5(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are numpy.ndarray with dtype 'float64'.
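+ # (UniformNumpy.__init__ above casts only non-float dtypes to float32, so these float64 arrays are kept as float64 by the numpy reference.)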
+ self.low_np = np.random.randn(batch_size, dims).astype('float64') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = self.low_np + self.dynamic_high = self.high_np + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + + +class UniformTest6(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are Tensor with dtype 'VarType.FP32'. + self.low_np = np.random.randn(batch_size, dims).astype('float32') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = paddle.to_tensor(self.low_np) + self.dynamic_high = paddle.to_tensor(self.high_np) + self.dynamic_values = paddle.to_tensor(self.values_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_low = layers.data( + name='low', shape=[dims], dtype='float32') + self.static_high = layers.data( + name='high', shape=[dims], dtype='float32') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest7(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are Tensor with dtype 'VarType.FP64'. + self.low_np = np.random.randn(batch_size, dims).astype('float64') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = paddle.to_tensor(self.low_np, dtype='float64') + self.dynamic_high = paddle.to_tensor(self.high_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_low = layers.data( + name='low', shape=[dims], dtype='float64') + self.static_high = layers.data( + name='high', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + + +class UniformTest8(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are Tensor with dtype 'VarType.FP64'. value's dtype is 'VarType.FP32'. 
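+ # (Mixed-dtype case: UniformNumpy.log_prob and probs cast their bounds checks to low's dtype, so the float64 parameters determine the reference result's dtype.)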
+ self.low_np = np.random.randn(batch_size, dims).astype('float64') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = paddle.to_tensor(self.low_np, dtype='float64') + self.dynamic_high = paddle.to_tensor(self.high_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float32') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_low = layers.data( + name='low', shape=[dims], dtype='float64') + self.static_high = layers.data( + name='high', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class NormalTest(unittest.TestCase): + def setUp(self, use_gpu=False, batch_size=2, dims=3): + self.use_gpu = use_gpu + if not use_gpu: + self.place = fluid.CPUPlace() + self.gpu_id = -1 + else: + self.place = fluid.CUDAPlace(0) + self.gpu_id = 0 + + self.init_numpy_data(batch_size, dims) + + paddle.disable_static(self.place) + self.init_dynamic_data(batch_size, dims) + + paddle.enable_static() + self.test_program = fluid.Program() + self.executor = fluid.Executor(self.place) + self.init_static_data(batch_size, dims) + + def init_numpy_data(self, batch_size, dims): + # loc and scale are 'float' + self.loc_np = (np.random.ranf() - 0.5) * 4 + self.scale_np = (np.random.ranf() - 0.5) * 4 + while self.scale_np < 0: + self.scale_np = (np.random.ranf() - 0.5) * 4 + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = (np.random.ranf() - 0.5) * 4 + self.other_scale_np = (np.random.ranf() - 0.5) * 4 + while self.other_scale_np < 0: + self.other_scale_np = (np.random.ranf() - 0.5) * 4 + self.values_np = np.random.ranf(1).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = self.loc_np + self.dynamic_scale = self.scale_np + self.dynamic_other_loc = self.other_loc_np + self.dynamic_other_scale = self.other_scale_np + self.dynamic_values = paddle.to_tensor(self.values_np) + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[], dtype='float32') + + def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6): + sample, entropy, log_prob, probs, kl = fetch_list + + np_normal = NormalNumpy(self.loc_np, self.scale_np) + np_sample = np_normal.sample([sample_shape]) + np_entropy = np_normal.entropy() + np_lp = np_normal.log_prob(self.values_np) + np_p = np_normal.probs(self.values_np) + np_other_normal = NormalNumpy(self.other_loc_np, self.other_scale_np) + np_kl = np_normal.kl_divergence(np_other_normal) + + np.testing.assert_equal(sample.shape, np_sample.shape) np.testing.assert_allclose( - output_p_np, gt_p, rtol=tolerance, atol=tolerance) + entropy, np_entropy, rtol=tolerance, atol=tolerance) np.testing.assert_allclose( - output_p_variable, gt_p, rtol=tolerance, atol=tolerance) - - def test_uniform_distribution_static(self, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Test Uniform's methods in static mode. - - Args: - refer to ``compare_uniform_with_numpy`` function.
- """ - test_program = fluid.Program() - - low_np = np.random.randn(batch_size, dims).astype('float32') - low_float = np.random.uniform(-2, 1) - high_float = np.random.uniform(1, 3) - high_np = np.random.uniform(-5.0, 5.0, - (batch_size, dims)).astype('float32') - values_np = np.random.randn(batch_size, dims).astype('float32') - - data_list = [low_np, low_float, high_float, high_np, values_np] - - feed_vars, fetch_list = self.build_uniform_static( - test_program, batch_size, dims, sample_shape, low_float, high_float, - high_np, low_np, values_np) + log_prob, np_lp, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose(probs, np_p, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose(kl, np_kl, rtol=tolerance, atol=tolerance) - self.executor.run(fluid.default_startup_program()) + def test_normal_distribution_dygraph(self, sample_shape=7, tolerance=1e-6): + paddle.disable_static(self.place) + normal = Normal(self.dynamic_loc, self.dynamic_scale) + + sample = normal.sample([sample_shape]).numpy() + entropy = normal.entropy().numpy() + log_prob = normal.log_prob(self.dynamic_values).numpy() + probs = normal.probs(self.dynamic_values).numpy() + other_normal = Normal(self.dynamic_other_loc, self.dynamic_other_scale) + kl = normal.kl_divergence(other_normal).numpy() - # result calculated by paddle - output_list = self.executor.run(program=test_program, - feed=feed_vars, - fetch_list=fetch_list) - self.compare_uniform_with_numpy(data_list, output_list, batch_size, - dims, sample_shape, tolerance) - - def test_uniform_distribution_dygraph(self, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Test Uniform's methods in dynamic mode. - - Args: - refer to ``compare_uniform_with_numpy`` function. - """ - paddle.disable_static() - - low_np = np.random.randn(batch_size, dims).astype('float32') - low_float = np.random.uniform(-2, 1) - high_float = np.random.uniform(1, 3) - high_np = np.random.uniform(-5.0, 5.0, - (batch_size, dims)).astype('float32') - values_np = np.random.randn(batch_size, dims).astype('float32') - - data_list = [low_np, low_float, high_float, high_np, values_np] - output_list = self.build_uniform_dygraph(batch_size, dims, sample_shape, - low_float, high_float, high_np, - low_np, values_np) - - self.compare_uniform_with_numpy(data_list, output_list, batch_size, - dims, sample_shape, tolerance) + fetch_list = [sample, entropy, log_prob, probs, kl] + self.compare_with_numpy(fetch_list) + + def test_normal_distribution_static(self, sample_shape=7, tolerance=1e-6): paddle.enable_static() + with fluid.program_guard(self.test_program): + normal = Normal(self.static_loc, self.static_scale) + + sample = normal.sample([sample_shape]) + entropy = normal.entropy() + log_prob = normal.log_prob(self.static_values) + probs = normal.probs(self.static_values) + other_normal = Normal(self.static_other_loc, + self.static_other_scale) + kl = normal.kl_divergence(other_normal) + + fetch_list = [sample, entropy, log_prob, probs, kl] + + feed_vars = { + 'loc': self.loc_np, + 'scale': self.scale_np, + 'values': self.values_np, + 'other_loc': self.other_loc_np, + 'other_scale': self.other_scale_np + } + + self.executor.run(fluid.default_startup_program()) + fetch_list = self.executor.run(program=self.test_program, + feed=feed_vars, + fetch_list=fetch_list) + + self.compare_with_numpy(fetch_list) + + +class NormalTest2(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc ans scale are 'int' + self.loc_np = int((np.random.ranf() - 0.5) * 8) + self.scale_np = 
int((np.random.ranf() - 0.5) * 8) + while self.scale_np < 0: + self.scale_np = int((np.random.ranf() - 0.5) * 8) + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = int((np.random.ranf() - 0.5) * 8) + self.other_scale_np = int((np.random.ranf() - 0.5) * 8) + while self.other_scale_np < 0: + self.other_scale_np = int((np.random.ranf() - 0.5) * 8) + self.values_np = np.random.ranf(1).astype('float32') + + +class NormalTest3(NormalTest): + def init_numpy_data(self, batch_size, dims): + # test broadcast: loc is float, scale is numpy.ndarray with dtype 'float32'. + self.loc_np = (np.random.ranf() - 0.5) * 4 + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = (np.random.ranf() - 0.5) * 4 + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.other_scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class NormalTest4(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are numpy.ndarray with dtype 'float32'. + self.loc_np = np.random.randn(batch_size, dims).astype('float32') + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.other_scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class NormalTest5(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are numpy.ndarray with dtype 'float64'.
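+ # (NormalNumpy.entropy above casts np.log(2. * math.pi) to loc's dtype, so the float64 reference results stay float64 throughout.)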
+ self.loc_np = np.random.randn(batch_size, dims).astype('float64') + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + while not np.all(self.other_scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = self.loc_np + self.dynamic_scale = self.scale_np + self.dynamic_other_loc = self.other_loc_np + self.dynamic_other_scale = self.other_scale_np + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + + +class NormalTest6(NormalTest): + def init_data(self, batch_size=2, dims=3): + # loc and scale are Tensor with dtype 'VarType.FP32'. + self.loc_np = np.random.randn(batch_size, dims).astype('float32') + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + self.loc = paddle.to_tensor(self.loc_np) + self.scale = paddle.to_tensor(self.scale_np) + self.values = paddle.to_tensor(self.values_np) + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.other_scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + self.other_loc = paddle.to_tensor(self.other_loc_np) + self.other_scale = paddle.to_tensor(self.other_scale_np) + + def init_numpy_data(self, batch_size, dims): + # loc and scale are Tensor with dtype 'VarType.FP32'.
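+ # (Note: init_data above is not called by NormalTest.setUp, which uses the init_numpy_data/init_dynamic_data/init_static_data hooks below instead.)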
+ self.loc_np = np.random.randn(batch_size, dims).astype('float32') + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.other_scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = paddle.to_tensor(self.loc_np) + self.dynamic_scale = paddle.to_tensor(self.scale_np) + self.dynamic_values = paddle.to_tensor(self.values_np) + self.dynamic_other_loc = paddle.to_tensor(self.other_loc_np) + self.dynamic_other_scale = paddle.to_tensor(self.other_scale_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_loc = layers.data( + name='loc', shape=[dims], dtype='float32') + self.static_scale = layers.data( + name='scale', shape=[dims], dtype='float32') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + self.static_other_loc = layers.data( + name='other_loc', shape=[dims], dtype='float32') + self.static_other_scale = layers.data( + name='other_scale', shape=[dims], dtype='float32') + + +class NormalTest7(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are Tensor with dtype 'VarType.FP64'. + self.loc_np = np.random.randn(batch_size, dims).astype('float64') + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + while not np.all(self.other_scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = paddle.to_tensor(self.loc_np, dtype='float64') + self.dynamic_scale = paddle.to_tensor(self.scale_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + self.dynamic_other_loc = paddle.to_tensor( + self.other_loc_np, dtype='float64') + self.dynamic_other_scale = paddle.to_tensor( + self.other_scale_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_loc = layers.data( + name='loc', shape=[dims], dtype='float64') + self.static_scale = layers.data( + name='scale', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + self.static_other_loc = layers.data( + name='other_loc', shape=[dims], dtype='float64') + self.static_other_scale = layers.data( + name='other_scale', shape=[dims], dtype='float64') + + +class NormalTest8(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are Tensor with dtype 'VarType.FP64'. value's dtype is 'VarType.FP32'.
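+ # (Mirrors UniformTest8 for Normal: kl_divergence uses only the float64 loc/scale, while log_prob/probs also consume the float32 values tensor.)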
+ self.loc_np = np.random.randn(batch_size, dims).astype('float64') + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + while not np.all(self.other_scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = paddle.to_tensor(self.loc_np, dtype='float64') + self.dynamic_scale = paddle.to_tensor(self.scale_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np) + self.dynamic_other_loc = paddle.to_tensor( + self.other_loc_np, dtype='float64') + self.dynamic_other_scale = paddle.to_tensor( + self.other_scale_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_loc = layers.data( + name='loc', shape=[dims], dtype='float64') + self.static_scale = layers.data( + name='scale', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + self.static_other_loc = layers.data( + name='other_loc', shape=[dims], dtype='float64') + self.static_other_scale = layers.data( + name='other_scale', shape=[dims], dtype='float64') class DistributionTestError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 97137e91aa75975d189342f9ad6b9e1caf5b2346..7b9e25e1d4ae8dbb8e4a03d93a7d9c0f9dd18ea6 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -43,7 +43,7 @@ class TestDropoutOp(OpTest): class TestDropoutOpInput1d(OpTest): def setUp(self): self.op_type = "dropout" - self.inputs = {'X': np.random.random((2000)).astype("float32")} + self.inputs = {'X': np.random.random((2000, )).astype("float32")} self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False} self.outputs = { 'Out': self.inputs['X'], @@ -672,9 +672,11 @@ class TestAlphaDropoutFAPI(unittest.TestCase): res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.) res2 = paddle.nn.functional.alpha_dropout( x=input, p=0., training=False) + res3 = paddle.nn.functional.alpha_dropout(x=input, p=1.) in_np = np.random.random([40, 40]).astype("float32") res_np = in_np + res_np3 = np.zeros_like(in_np) exe = fluid.Executor(place) res_list = [res1, res2] @@ -683,6 +685,10 @@ class TestAlphaDropoutFAPI(unittest.TestCase): feed={"input": in_np}, fetch_list=[res]) self.assertTrue(np.allclose(fetches[0], res_np)) + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res3]) + self.assertTrue(np.allclose(fetches[0], res_np3)) def test_static(self): for place in self.places: @@ -693,15 +699,18 @@ class TestAlphaDropoutFAPI(unittest.TestCase): with fluid.dygraph.guard(place): in_np = np.random.random([40, 40]).astype("float32") res_np = in_np + res_np3 = np.zeros_like(in_np) input = fluid.dygraph.to_variable(in_np) res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.)
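# (alpha_dropout with p=1. drops every element, so res3 is compared against res_np3 = np.zeros_like(in_np))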
res2 = paddle.nn.functional.alpha_dropout( x=input, p=0., training=False) + res3 = paddle.nn.functional.alpha_dropout(x=input, p=1.) res_list = [res1, res2] for res in res_list: self.assertTrue(np.allclose(res.numpy(), res_np)) + self.assertTrue(np.allclose(res3.numpy(), res_np3)) class TestAlphaDropoutFAPIError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py index 466226c53fabbd315acd19c6421f210d0ca225c1..a963c2ece0958048b5f0c850184a0930022e6671 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py @@ -121,6 +121,9 @@ class TestDygraphWeightNorm(unittest.TestCase): before_weight = linear.weight.numpy() if self.dim == None: self.dim = -1 + + if self.dim != -1: + self.dim = (self.dim + len(before_weight)) % len(before_weight) wn = weight_norm(linear, dim=self.dim) outputs = [] for name, data in self.data.items(): @@ -158,6 +161,13 @@ class TestDygraphWeightNormCase3(TestDygraphWeightNorm): self.dim = 3 +class TestDygraphWeightNormCase4(TestDygraphWeightNorm): + def init_test_case(self): + self.batch_size = 3 + self.data_desc = (['x', [2, 3, 3]], ) + self.dim = -3 + + class TestDygraphRemoveWeightNorm(unittest.TestCase): def setUp(self): self.init_test_case() diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py index b4359fc69ae18b45774af0d2b20c1540bd99da5c..698f914f89984d8c09619a46c6a6b292b00aac9a 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py @@ -25,6 +25,7 @@ import bisect import numpy as np fluid.default_startup_program().random_seed = 1 +np.random.seed(1) class TestDyRnnStaticInput(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py index f5d8b4f704da8acd97475444346522f63d3724fd..cab6160d761004877896deea8d44ca02c9de2e1e 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py @@ -220,6 +220,14 @@ class TestRemainderAPI(unittest.TestCase): z_expected = np.array([0, 1, 1, -1]) self.assertEqual(np.allclose(z_expected, z.numpy()), True) + np_x = np.array([-3, 3]) + np_y = np.array([[2, 3], [-2, -1]]) + x = paddle.to_tensor(np_x, dtype="int64") + y = paddle.to_tensor(np_y, dtype="int64") + z = x % y + z_expected = np.array([[1, 0], [-1, 0]]) + self.assertEqual(np.allclose(z_expected, z.numpy()), True) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index 3475320eeebc55a14dd569410610b70ae35e65a3..43069470680c7d49071ce54bf3649962c56f06ea 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -350,6 +350,14 @@ class TestFillConstantOpError(unittest.TestCase): dtype='int16', out=x1) + self.assertRaises( + TypeError, + fluid.layers.fill_constant, + shape=[1.1], + value=5, + dtype='float32', + out=x1) + # The argument dtype of fill_constant_op must be one of bool, float16, #float32, float64, int32 or int64 x2 = fluid.layers.data(name='x2', shape=[1], dtype="int32") diff --git 
a/python/paddle/fluid/tests/unittests/test_fleet_auto.py b/python/paddle/fluid/tests/unittests/test_fleet_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..020f2f4db382ef1277167d85917e8fdba9c83893 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_auto.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import os +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker + + +class TestDistributedStrategyAuto(unittest.TestCase): + def setUp(self): + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_distributed_strategy_auto(self): + fleet.init(is_collective=True) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.auto = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py index 9e651dea24ba7f35f3785093da8ac73dde07be5a..4ced9841ee43e02a3d1e3f292bf97200dec29f5c 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py @@ -18,6 +18,7 @@ import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker import os import paddle.fluid as fluid +import numpy as np class TestFleetBase(unittest.TestCase): @@ -125,5 +126,110 @@ class TestFleetBase(unittest.TestCase): self.assertRaises(Exception, fleet.init_worker) +class TestFleetDygraph(unittest.TestCase): + def setUp(self): + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213,127.0.0.1:36214" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_TRAINER_ID"] = "0" + + def test_dygraph_method(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = fluid.dygraph.to_variable(value) + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam( + learning_rate=0.01, parameters=layer.parameters()) + # remove init cause this UT cannot launch distributed task + 
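+ # (fleet.distributed_optimizer and fleet.distributed_model below still run without fleet.init; the test only checks set_lr/get_lr and the state_dict round-trip)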
adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + lr = 0.001 + adam.set_lr(lr) + cur_lr = adam.get_lr() + assert (lr == cur_lr) + state_dict = adam.state_dict() + adam.set_state_dict(state_dict) + + +class TestFleetBaseSingleRunCollective(unittest.TestCase): + def setUp(self): + os.environ.pop("PADDLE_TRAINER_ENDPOINTS") + + def gen_data(self): + return { + "x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + def test_single_run_collective_minimize(self): + input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + + fleet.init(is_collective=True) + optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer) + optimizer.minimize(avg_cost) + + place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + exe = fluid.Executor(place) + exe.run(paddle.static.default_startup_program()) + + for i in range(10): + cost_val = exe.run(feed=self.gen_data(), fetch_list=[avg_cost.name]) + print("cost of step[{}] = {}".format(i, cost_val)) + + +class TestFleetBaseSingleRunPS(unittest.TestCase): + def setUp(self): + os.environ.pop("PADDLE_PSERVERS_IP_PORT_LIST") + + def gen_data(self): + return { + "x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + def test_single_run_ps_minimize(self): + input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + + fleet.init() + strategy = paddle.distributed.fleet.DistributedStrategy() + optimizer = fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + if fleet.is_server(): + fleet.init_server() + fleet.run_server() + elif fleet.is_worker(): + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(paddle.static.default_startup_program()) + step = 100 + for i in range(step): + cost_val = exe.run(program=fluid.default_main_program(), + feed=self.gen_data(), + fetch_list=[avg_cost.name]) + print("worker_index: %d, step%d cost = %f" % + (fleet.worker_index(), i, cost_val[0])) + fleet.save_persistables(exe, "fleet_single_model/") + print("save fleet models done.") + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index fc668ce3493e96e0790af522a439367fe10455f3..dddc6811ef08bdf8504cb6b4fe09813336875b10 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -239,24 +239,24 @@ class TestGaussianRandomAPI(unittest.TestCase): def test_default_dtype(self): paddle.disable_static() - def test_default_fp_16(): + def test_default_fp16(): 
paddle.framework.set_default_dtype('float16') - paddle.tensor.random.gaussian_random([2, 3]) + paddle.tensor.random.gaussian([2, 3]) - self.assertRaises(TypeError, test_default_fp_16) + self.assertRaises(TypeError, test_default_fp16) - def test_default_fp_32(): + def test_default_fp32(): paddle.framework.set_default_dtype('float32') - out = paddle.tensor.random.gaussian_random([2, 3]) + out = paddle.tensor.random.gaussian([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) - def test_default_fp_64(): + def test_default_fp64(): paddle.framework.set_default_dtype('float64') - out = paddle.tensor.random.gaussian_random([2, 3]) + out = paddle.tensor.random.gaussian([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) - test_default_fp_64() - test_default_fp_32() + test_default_fp64() + test_default_fp32() paddle.enable_static() @@ -265,24 +265,24 @@ class TestStandardNormalDtype(unittest.TestCase): def test_default_dtype(self): paddle.disable_static() - def test_default_fp_16(): + def test_default_fp16(): paddle.framework.set_default_dtype('float16') paddle.tensor.random.standard_normal([2, 3]) - self.assertRaises(TypeError, test_default_fp_16) + self.assertRaises(TypeError, test_default_fp16) - def test_default_fp_32(): + def test_default_fp32(): paddle.framework.set_default_dtype('float32') out = paddle.tensor.random.standard_normal([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) - def test_default_fp_64(): + def test_default_fp64(): paddle.framework.set_default_dtype('float64') out = paddle.tensor.random.standard_normal([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) - test_default_fp_64() - test_default_fp_32() + test_default_fp64() + test_default_fp32() paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 74cfeab601b04d9624a5f6e48fd06c6cbf3715f8..22f16287c33f96a43361b5fe4ed5d0fe3edbb1bc 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -652,7 +652,7 @@ class TestDygraphUtils(unittest.TestCase): a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32) helper = LayerHelper(fluid.unique_name.generate("test"), act="relu") func = helper.append_activation - with fluid.dygraph.guard(): + with fluid.dygraph.guard(fluid.core.CPUPlace()): a = fluid.dygraph.to_variable(a_np) fluid.set_flags({'FLAGS_use_mkldnn': True}) try: diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index 48aea3a584dd25667704b22d99d1074c481bb76c..22e19efcb58d19c41835565de2c8c01fe253702a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -374,8 +374,7 @@ class TestDygraphPtbRnn(unittest.TestCase): adam._learning_rate.step_num = 0 para_state_dict, opti_state_dict = paddle.load("./test_dy") - print(opti_state_dict['LR_Scheduler']) - adam.set_dict(opti_state_dict) + adam.set_state_dict(opti_state_dict) opti_dict = adam.state_dict() for k, v in opti_dict.items(): @@ -393,7 +392,7 @@ class TestDygraphPtbRnn(unittest.TestCase): var.set(np.zeros_like(np_t), place) - ptb_model.set_dict(para_state_dict) + ptb_model.set_state_dict(stat_dict=para_state_dict) state_dict = ptb_model.state_dict() @@ -483,7 +482,7 @@ class TestDygraphPtbRnn(unittest.TestCase): if 
isinstance(adam._learning_rate, LearningRateDecay): adam._learning_rate.step_num = 0 - adam.set_dict(self.opti_dict) + adam.set_state_dict(self.opti_dict) opti_dict = adam.state_dict() for k, v in opti_dict.items(): if isinstance(v, core.VarBase): @@ -500,7 +499,7 @@ class TestDygraphPtbRnn(unittest.TestCase): var.set(np.zeros_like(np_t), place) - ptb_model.set_dict(self.state_dict) + ptb_model.set_state_dict(self.state_dict) state_dict = ptb_model.state_dict() @@ -593,7 +592,7 @@ class TestDygraphPtbRnn(unittest.TestCase): if isinstance(adam._learning_rate, LearningRateDecay): adam._learning_rate.step_num = 0 - adam.set_dict(np_opti_dict) + adam.set_state_dict(np_opti_dict) opti_dict = adam.state_dict() for k, v in opti_dict.items(): @@ -613,7 +612,7 @@ class TestDygraphPtbRnn(unittest.TestCase): var.set(np.zeros_like(np_t), place) - ptb_model.set_dict(np_state_dict) + ptb_model.set_state_dict(np_state_dict) state_dict = ptb_model.state_dict() @@ -656,8 +655,8 @@ class TestDygraphPtbRnn(unittest.TestCase): last_hidden = None last_cell = None - adam.set_dict(self.opti_dict) - ptb_model.set_dict(self.state_dict) + adam.set_state_dict(self.opti_dict) + ptb_model.set_state_dict(self.state_dict) for i in range(1): x_data = np.arange(12).reshape(4, 3).astype('int64') @@ -745,8 +744,8 @@ class TestDygraphPtbRnn(unittest.TestCase): last_cell = None state_dict, opti_dict = fluid.load_dygraph("./test_dy") - adam.set_dict(opti_dict) - ptb_model.set_dict(state_dict) + adam.set_state_dict(opti_dict) + ptb_model.set_state_dict(state_dict) for i in range(1): x_data = np.arange(12).reshape(4, 3).astype('int64') @@ -849,8 +848,8 @@ class TestDygraphPtbRnn(unittest.TestCase): for k, v in self.state_dict.items(): np_state_dict[k] = v.numpy() - adam.set_dict(np_opti_dict) - ptb_model.set_dict(np_state_dict) + adam.set_state_dict(np_opti_dict) + ptb_model.set_state_dict(np_state_dict) for i in range(1): x_data = np.arange(12).reshape(4, 3).astype('int64') y_data = np.arange(1, 13).reshape(4, 3).astype('int64') @@ -912,6 +911,22 @@ class TestDygraphPtbRnn(unittest.TestCase): para_state_dict, opti_state_dict = paddle.load( os.path.join('saved_dy', 'emb_dy.pdopt')) + def test_load_compatible_with_keep_name_table(self): + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + state_dict = emb.state_dict() + paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy'), True) + self.assertTrue(para_state_dict != None) + self.assertTrue(opti_state_dict == None) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy'), keep_name_table=True) + self.assertTrue(para_state_dict != None) + self.assertTrue(opti_state_dict == None) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 3ccd1dbda3a443d50e43ba498cb3d5b529318c32..3eb413a62664057c56567d5834b216110fac04fb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -918,6 +918,29 @@ class TestDygraphPtbRnn(unittest.TestCase): para_state_dict, opti_state_dict = paddle.load( os.path.join('saved_dy', 'emb_dy.pdopt')) + def test_no_state_in_input_dict(self): + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + state_dict = emb.state_dict() + paddle.save(state_dict, 
os.path.join('saved_dy', 'emb_dy')) + + para_state_dict, _ = paddle.load(os.path.join('saved_dy', 'emb_dy')) + para_state_dict.pop('weight') + + emb.set_state_dict(para_state_dict) + + def test_state_shape_mismatch(self): + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + state_dict = emb.state_dict() + paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + + para_state_dict, _ = paddle.load(os.path.join('saved_dy', 'emb_dy')) + para_state_dict['weight'] = np.expand_dims( + para_state_dict['weight'], axis=-1) + + emb.set_state_dict(para_state_dict) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 87b6e76a6d0ab7f5fba7c4526734d81475e1540e..f7fcc1ff561b90dc1b78a67ffbe7c047ed06d0e9 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -183,25 +183,6 @@ class TestJitSaveLoad(unittest.TestCase): with self.assertRaises(ValueError): model_dict, _ = fluid.dygraph.load_dygraph(model_path) - def test_load_dygraph_no_var_info(self): - model_path = "model.test_jit_save_load.no_var_info" - self.train_and_save_model(model_path=model_path) - # remove `__variables.info__` - var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) - os.remove(var_info_path) - new_layer = LinearNet(784, 1) - with self.assertRaises(RuntimeError): - model_dict, _ = fluid.dygraph.load_dygraph(model_path) - - def test_load_dygraph_not_var_file(self): - model_path = "model.test_jit_save_load.no_var_file" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.params_filename = "__params__" - self.train_and_save_model(model_path=model_path, configs=configs) - new_layer = LinearNet(784, 1) - with self.assertRaises(RuntimeError): - model_dict, _ = fluid.dygraph.load_dygraph(model_path) - class LinearNetMultiInput(fluid.dygraph.Layer): def __init__(self, in_size, out_size): diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py index 53e8b02081ae3acf8a7fb5dd2bc6e05cbc3be901..c9948edad0061012cf028bec674a4bb713364541 100755 --- a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py @@ -293,7 +293,7 @@ class TestLinearInterpOpAPI2_0(unittest.TestCase): # dygraph x_data = np.random.random((1, 3, 128)).astype("float32") - us_1 = paddle.nn.UpSample( + us_1 = paddle.nn.Upsample( size=[64, ], mode='linear', align_mode=1, @@ -385,19 +385,19 @@ class TestLinearInterpOpError(unittest.TestCase): def input_shape_error(): x1 = fluid.data(name="x1", shape=[1], dtype="float32") - out1 = paddle.nn.UpSample( + out1 = paddle.nn.Upsample( size=[256, ], data_format='NCW', mode='linear') out1_res = out1(x1) def data_format_error(): x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32") - out2 = paddle.nn.UpSample( + out2 = paddle.nn.Upsample( size=[256, ], data_format='NHWCD', mode='linear') out2_res = out2(x2) def out_shape_error(): x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32") - out3 = paddle.nn.UpSample( + out3 = paddle.nn.Upsample( size=[ 256, 256, diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py index 04b56677fc158583fe79ec0dc1276210bd2ebbdc..b34989f5f5c79dfd27158f120175824389ac9731 100755 --- 
a/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py @@ -26,6 +26,7 @@ from paddle.nn.functional import interpolate def linear_interp_np(input, out_w, + scale_w=0, out_size=None, actual_shape=None, align_corners=True, @@ -44,7 +45,10 @@ def linear_interp_np(input, if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((batch_size, channel, out_w)) @@ -81,6 +85,7 @@ class TestLinearInterpOp(OpTest): self.op_type = "linear_interp_v2" input_np = np.random.random(self.input_shape).astype("float64") + scale_w = 0 if self.data_layout == "NCHW": in_w = self.input_shape[2] else: @@ -95,7 +100,7 @@ class TestLinearInterpOp(OpTest): else: out_w = self.out_w - output_np = linear_interp_np(input_np, out_w, self.out_size, + output_np = linear_interp_np(input_np, out_w, self.scale, self.out_size, self.actual_shape, self.align_corners, self.align_mode, self.data_layout) self.inputs = {'X': input_np} @@ -195,7 +200,7 @@ class TestLinearInterpOpSizeTensor(TestLinearInterpOp): else: out_w = self.out_w - output_np = linear_interp_np(input_np, out_w, self.out_size, + output_np = linear_interp_np(input_np, out_w, 0, self.out_size, self.actual_shape, self.align_corners, self.align_mode, self.data_layout) @@ -309,7 +314,7 @@ class TestLinearInterpOpAPI2_0(unittest.TestCase): # dygraph x_data = np.random.random((1, 3, 128)).astype("float32") - us_1 = paddle.nn.UpSample( + us_1 = paddle.nn.Upsample( size=[64, ], mode='linear', align_mode=1, @@ -342,7 +347,7 @@ class TestResizeLinearOpUint8(OpTest): else: out_w = self.out_w - output_np = linear_interp_np(input_np, out_w, self.out_size, + output_np = linear_interp_np(input_np, out_w, 0, self.out_size, self.actual_shape, self.align_corners, self.align_mode) self.inputs = {'X': input_np} @@ -410,19 +415,19 @@ class TestLinearInterpOpError(unittest.TestCase): def input_shape_error(): x1 = fluid.data(name="x1", shape=[1], dtype="float32") - out1 = paddle.nn.UpSample( + out1 = paddle.nn.Upsample( size=[256, ], data_format='NCW', mode='linear') out1_res = out1(x1) def data_format_error(): x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32") - out2 = paddle.nn.UpSample( + out2 = paddle.nn.Upsample( size=[256, ], data_format='NHWCD', mode='linear') out2_res = out2(x2) def out_shape_error(): x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32") - out3 = paddle.nn.UpSample( + out3 = paddle.nn.Upsample( size=[ 256, 256, diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py index 6d1f42111eebff0f469317ddf2a9ec7698a7ae1e..03cb84ec99e0259a33a086c3d3e5a71abea09d2b 100644 --- a/python/paddle/fluid/tests/unittests/test_linspace.py +++ b/python/paddle/fluid/tests/unittests/test_linspace.py @@ -154,16 +154,16 @@ class TestLinspaceOpError(unittest.TestCase): self.assertRaises(TypeError, test_step_dtype) def test_start_dtype(): - start = fluid.data(shape=[1], dtype="int32", name="start") + start = fluid.data(shape=[1], dtype="float64", name="start") fluid.layers.linspace(start, 10, 1, dtype="float32") - self.assertRaises(TypeError, test_start_dtype) + self.assertRaises(ValueError, test_start_dtype) def test_end_dtype(): - end = fluid.data(shape=[1], dtype="int32", name="end") + end = fluid.data(shape=[1], dtype="float64", name="end") fluid.layers.linspace(0, end, 1, dtype="float32") - 
self.assertRaises(TypeError, test_end_dtype) + self.assertRaises(ValueError, test_end_dtype) def test_num_dtype(): num = fluid.data(shape=[1], dtype="int32", name="step") diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py new file mode 100644 index 0000000000000000000000000000000000000000..ed1939dbe279f28883d9e33178f1cfa256140e33 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -0,0 +1,165 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import six +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from test_imperative_base import new_program_scope + + +def convolutional_neural_network(img): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + return prediction + + +def static_train_net(img, label): + prediction = convolutional_neural_network(img) + + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + + optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer.minimize(avg_loss) + + return prediction, avg_loss + + +class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase): + def setUp(self): + self.seed = 90 + self.epoch_num = 1 + self.batch_size = 128 + self.batch_num = 10 + + def train_and_save_model(self): + with new_program_scope(): + startup_program = fluid.default_startup_program() + main_program = fluid.default_main_program() + + img = fluid.data( + name='img', shape=[None, 1, 28, 28], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + + prediction, avg_loss = static_train_net(img, label) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + exe = fluid.Executor(place) + + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + exe.run(startup_program) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=100), + batch_size=self.batch_size) + + for _ in range(0, self.epoch_num): + for batch_id, data in enumerate(train_reader()): + exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[avg_loss]) + + if batch_id > self.batch_num: + break + + static_param_dict = {} + for param in fluid.default_main_program().all_parameters(): + static_param_dict[param.name] = fluid.executor._fetch_var( + param.name) + + fluid.io.save_inference_model( + self.save_dirname, ["img"], [prediction], + 
exe, + model_filename=self.model_filename, + params_filename=self.params_filename) + + return static_param_dict + + def check_load_state_dict(self, orig_dict, load_dict): + for var_name, value in six.iteritems(orig_dict): + self.assertTrue(np.array_equal(value, load_dict[var_name])) + + def test_load_default(self): + self.save_dirname = "static_mnist.load_state_dict.default" + self.model_filename = None + self.params_filename = None + orig_param_dict = self.train_and_save_model() + + configs = paddle.SaveLoadConfig() + configs.separate_params = True + load_param_dict, _ = paddle.load(self.save_dirname, configs) + self.check_load_state_dict(orig_param_dict, load_param_dict) + + def test_load_with_model_filename(self): + self.save_dirname = "static_mnist.load_state_dict.model_filename" + self.model_filename = "static_mnist.model" + self.params_filename = None + orig_param_dict = self.train_and_save_model() + + configs = paddle.SaveLoadConfig() + configs.separate_params = True + configs.model_filename = self.model_filename + load_param_dict, _ = paddle.load(self.save_dirname, configs) + self.check_load_state_dict(orig_param_dict, load_param_dict) + + def test_load_with_param_filename(self): + self.save_dirname = "static_mnist.load_state_dict.param_filename" + self.model_filename = None + self.params_filename = "static_mnist.params" + orig_param_dict = self.train_and_save_model() + + configs = paddle.SaveLoadConfig() + configs.params_filename = self.params_filename + load_param_dict, _ = paddle.load(self.save_dirname, configs) + self.check_load_state_dict(orig_param_dict, load_param_dict) + + def test_load_with_model_and_param_filename(self): + self.save_dirname = "static_mnist.load_state_dict.model_and_param_filename" + self.model_filename = "static_mnist.model" + self.params_filename = "static_mnist.params" + orig_param_dict = self.train_and_save_model() + + configs = paddle.SaveLoadConfig() + configs.params_filename = self.params_filename + configs.model_filename = self.model_filename + load_param_dict, _ = paddle.load(self.save_dirname, configs) + self.check_load_state_dict(orig_param_dict, load_param_dict) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py index 90430bbce4d1896c8fdbb829230f2ad8a691adff..1f3dab67f2afe4e2b0a655634bb808ad0951ae9e 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import math import paddle.fluid.core as core from op_test import OpTest @@ -27,120 +28,372 @@ SIGMOID_THRESHOLD_MAX = 13.0 EXP_MAX_INPUT = 40.0 -def lstm_naive(input, w): - seq_len, batch_size, hidden_size = input.shape - - offset = 0 - wi = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - wf = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - wc = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - wo = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - ri = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset 
+= hidden_size * hidden_size - rf = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - rc = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - ro = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - - bi_1 = w[offset:offset + hidden_size] - offset += hidden_size - bf_1 = w[offset:offset + hidden_size] - offset += hidden_size - bc_1 = w[offset:offset + hidden_size] - offset += hidden_size - bo_1 = w[offset:offset + hidden_size] - offset += hidden_size - - bi_2 = w[offset:offset + hidden_size] - offset += hidden_size - bf_2 = w[offset:offset + hidden_size] - offset += hidden_size - bc_2 = w[offset:offset + hidden_size] - offset += hidden_size - bo_2 = w[offset:offset + hidden_size] - - def sigmoid(x): - y = np.copy(x) - y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN - y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX - return 1. / (1. + np.exp(-y)) - - def tanh(x): - y = -2. * x - y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT - return (2. / (1. + np.exp(y))) - 1. - - output = [] - pre_h = np.zeros((1, batch_size, hidden_size), dtype=input.dtype) - pre_c = np.zeros((1, batch_size, hidden_size), dtype=input.dtype) - - for i in range(seq_len): - emb_1 = input[i] - - input_gate = sigmoid( - np.matmul(emb_1, wi) + np.matmul(pre_h, ri) + bi_1 + bi_2) - forget_gate = sigmoid( - np.matmul(emb_1, wf) + np.matmul(pre_h, rf) + bf_1 + bf_2) - output_gate = sigmoid( - np.matmul(emb_1, wo) + np.matmul(pre_h, ro) + bo_1 + bo_2) - c_t_temp = tanh( - np.matmul(emb_1, wc) + np.matmul(pre_h, rc) + bc_1 + bc_2) - new_c = input_gate * c_t_temp + forget_gate * pre_c - new_h = output_gate * tanh(new_c) - - pre_h = new_h - pre_c = new_c - - output.append(new_h) - - output = np.concatenate(output, -1) - output = output.reshape((batch_size, -1, hidden_size)) - output = output.transpose((1, 0, 2)) - - return output, pre_h, pre_c +class LayerMixin(object): + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + +class LayerListMixin(LayerMixin): + def __init__(self, layers=None): + self._layers = list(layers) if layers else [] + + def append(self, layer): + self._layers.append(layer) + + def __iter__(self): + return iter(self._layers) + + +class LSTMCell(LayerMixin): + def __init__(self, input_size, hidden_size, bias=True): + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + self.dtype = np.float64 + self.parameters = dict() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = np.ones( + (4 * hidden_size, input_size), dtype=self.dtype) + self.weight_hh = np.ones((4 * hidden_size, + hidden_size)).astype(self.dtype) + self.parameters['weight_ih'] = self.weight_ih + self.parameters['weight_hh'] = self.weight_hh + if bias: + self.bias_ih = np.ones((4 * hidden_size)).astype(self.dtype) + self.bias_hh = np.ones((4 * hidden_size)).astype(self.dtype) + self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = self.bias_hh + else: + self.bias_ih = None + self.bias_hh = None + + def init_state(self, inputs): + batch_size = inputs.shape[0] + init_h = np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + init_c = np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + return init_h, init_c + + def forward(self, inputs, hx=None): + if hx is None: + hx = self.init_state(inputs) + pre_hidden, 
pre_cell = hx + gates = np.matmul(inputs, self.weight_ih.T) + if self.bias_ih is not None: + gates = gates + self.bias_ih + gates += np.matmul(pre_hidden, self.weight_hh.T) + if self.bias_hh is not None: + gates = gates + self.bias_hh + + chunked_gates = np.split(gates, 4, -1) + + i = 1.0 / (1.0 + np.exp(-chunked_gates[0])) + f = 1.0 / (1.0 + np.exp(-chunked_gates[1])) + o = 1.0 / (1.0 + np.exp(-chunked_gates[3])) + c = f * pre_cell + i * np.tanh(chunked_gates[2]) + h = o * np.tanh(c) + + return h, (h, c) + + +def sequence_mask(lengths, max_len=None): + if max_len is None: + max_len = np.max(lengths) + else: + assert max_len >= np.max(lengths) + return np.arange(max_len) < np.expand_dims(lengths, -1) + + +def update_state(mask, new, old): + if not isinstance(old, (tuple, list)): + return np.where(mask, new, old) + else: + return tuple(map(lambda x, y: np.where(mask, x, y), new, old)) + + +def rnn(cell, + inputs, + initial_states, + sequence_length=None, + time_major=False, + is_reverse=False): + if not time_major: + inputs = np.transpose(inputs, [1, 0, 2]) + if is_reverse: + inputs = np.flip(inputs, 0) + + if sequence_length is None: + mask = None + else: + mask = np.transpose(sequence_mask(sequence_length), [1, 0]) + mask = np.expand_dims(mask, -1) + if is_reverse: + mask = np.flip(mask, 0) + + time_steps = inputs.shape[0] + state = initial_states + outputs = [] + for t in range(time_steps): + x_t = inputs[t] + if mask is not None: + m_t = mask[t] + y, new_state = cell(x_t, state) + y = np.where(m_t, y, 0.) + outputs.append(y) + state = update_state(m_t, new_state, state) + else: + y, new_state = cell(x_t, state) + outputs.append(y) + state = new_state + + outputs = np.stack(outputs) + final_state = state + + if is_reverse: + outputs = np.flip(outputs, 0) + if not time_major: + outputs = np.transpose(outputs, [1, 0, 2]) + return outputs, final_state + + +def birnn(cell_fw, + cell_bw, + inputs, + initial_states, + sequence_length=None, + time_major=False): + states_fw, states_bw = initial_states + outputs_fw, states_fw = rnn(cell_fw, + inputs, + states_fw, + sequence_length, + time_major=time_major) + + outputs_bw, states_bw = rnn(cell_bw, + inputs, + states_bw, + sequence_length, + time_major=time_major, + is_reverse=True) + + outputs = np.concatenate((outputs_fw, outputs_bw), -1) + final_states = (states_fw, states_bw) + return outputs, final_states + + +def flatten(nested): + return list(_flatten(nested)) + + +def _flatten(nested): + for item in nested: + if isinstance(item, (list, tuple)): + for subitem in _flatten(item): + yield subitem + else: + yield item + + +def unstack(array, axis=0): + num = array.shape[axis] + sub_arrays = np.split(array, num, axis) + return [np.squeeze(sub_array, axis) for sub_array in sub_arrays] + + +def dropout(array, p=0.0): + if p == 0.0: + return array + + mask = (np.random.uniform(size=array.shape) < (1 - p)).astype(array.dtype) + return array * (mask / (1 - p)) + + +def split_states(states, bidirectional=False, state_components=1): + if state_components == 1: + states = unstack(states) + if not bidirectional: + return states + else: + return list(zip(states[::2], states[1::2])) + else: + assert len(states) == state_components + states = tuple([unstack(item) for item in states]) + if not bidirectional: + return list(zip(*states)) + else: + states = list(zip(*states)) + return list(zip(states[::2], states[1::2])) + + +def concat_states(states, bidirectional=False, state_components=1): + if state_components == 1: + return np.stack(flatten(states)) + else: 
+ states = flatten(states) + components = [] + for i in range(state_components): + components.append(states[i::state_components]) + return [np.stack(item) for item in components] + + +class RNN(LayerMixin): + def __init__(self, cell, is_reverse=False, time_major=False): + super(RNN, self).__init__() + self.cell = cell + if not hasattr(self.cell, "call"): + # for non-dygraph mode, `rnn` api uses cell.call + self.cell.call = self.cell.forward + self.is_reverse = is_reverse + self.time_major = time_major + + def forward(self, inputs, initial_states=None, sequence_length=None): + final_outputs, final_states = rnn(self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse) + return final_outputs, final_states + + +class BiRNN(LayerMixin): + def __init__(self, cell_fw, cell_bw, time_major=False): + super(BiRNN, self).__init__() + self.cell_fw = cell_fw + self.cell_bw = cell_bw + self.time_major = time_major + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == 2, \ + "length of initial_states should be 2 when it is a list/tuple" + else: + initial_states = [initial_states, initial_states] + + outputs, final_states = birnn(self.cell_fw, self.cell_bw, inputs, + initial_states, sequence_length, + self.time_major) + return outputs, final_states + + +class RNNMixin(LayerListMixin): + def forward(self, inputs, initial_states=None, sequence_length=None): + batch_index = 1 if self.time_major else 0 + batch_size = inputs.shape[batch_index] + dtype = inputs.dtype + if initial_states is None: + state_shape = (self.num_layers * self.num_directions, batch_size, + self.hidden_size) + if self.state_components == 1: + initial_states = np.zeros(state_shape, dtype) + else: + initial_states = tuple([ + np.zeros(state_shape, dtype) + for _ in range(self.state_components) + ]) + + states = split_states(initial_states, self.num_directions == 2, + self.state_components) + final_states = [] + + for i, rnn_layer in enumerate(self): + if i > 0: + inputs = dropout(inputs, self.dropout) + outputs, final_state = rnn_layer(inputs, states[i], sequence_length) + final_states.append(final_state) + inputs = outputs + + final_states = concat_states(final_states, self.num_directions == 2, + self.state_components) + return outputs, final_states + + +class LSTM(RNNMixin): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0., + time_major=False): + super(LSTM, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = LSTMCell(input_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = LSTMCell(hidden_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = LSTMCell(input_size, hidden_size) + cell_bw = LSTMCell(input_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = LSTMCell(2 * hidden_size, hidden_size) + cell_bw = LSTMCell(2 * hidden_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if 
direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 2 @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestCUDNNLstmOp(OpTest): - # TODO(GaoWei8):when input dtype is fp64, precision threshold should be removed. + # TODO(GaoWei8): the result needs to be checked against the new interface def setUp(self): self.op_type = "cudnn_lstm" self.dtype = np.float64 + self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32) + self.num_layers = 1 + self.set_attrs() # apply subclass overrides (TestCUDNNLstmOp2/3) - seq_length = 20 + seq_length = 12 batch_size = 5 - hidden_size = 20 + input_size = 21 + hidden_size = 21 input_weight_size = (hidden_size * hidden_size) * 4 hidden_weight_size = (hidden_size * hidden_size) * 4 weight_size = input_weight_size + hidden_weight_size weight_size += hidden_size * 8 + weight_size *= self.num_layers input = np.random.uniform( - low=-0.1, high=0.1, size=(seq_length, batch_size, - hidden_size)).astype(self.dtype) - flat_w = np.random.uniform( - low=-0.1, high=0.1, size=(weight_size)).astype(self.dtype) - - output, last_hidden, last_cell = lstm_naive(input, flat_w) - - init_h = np.zeros((1, batch_size, hidden_size), dtype=np.float64) - init_c = np.zeros((1, batch_size, hidden_size), dtype=np.float64) + low=-0.1, high=0.1, + size=(seq_length, batch_size, input_size)).astype(self.dtype) + input[11][1:][:] = 0 + input[10][2:][:] = 0 + input[9][3:][:] = 0 + input[8][4:][:] = 0 + + rnn1 = LSTM( + input_size, + hidden_size, + self.num_layers, + time_major=True, + direction="forward") + + output, (last_hidden, last_cell) = rnn1( + input, sequence_length=self.sequence_length) + + flat_w = np.ones((weight_size)).astype(self.dtype) + init_h = np.zeros((self.num_layers, batch_size, + hidden_size)).astype(self.dtype) + init_c = np.zeros((self.num_layers, batch_size, + hidden_size)).astype(self.dtype) state_out = np.ndarray((300)).astype("uint8") self.inputs = { @@ -152,9 +405,10 @@ class TestCUDNNLstmOp(OpTest): self.attrs = { 'dropout_prob': 0.0, 'is_bidirec': False, - 'input_size': hidden_size, + 'input_size': input_size, 'hidden_size': hidden_size, 'num_layers': 1, + 'sequence_length': self.sequence_length.tolist() } self.outputs = { 'Out': output, @@ -164,19 +418,33 @@ class TestCUDNNLstmOp(OpTest): 'StateOut': state_out } + def set_attrs(self): + pass + def test_output_with_place(self): - # depend on the scope structure place = core.CUDAPlace(0) self.check_output_with_place( place, no_check_set=['Reserve', 'StateOut']) def test_grad_with_place(self): - # depend on the scope structure place = core.CUDAPlace(0) - self.check_grad_with_place( - place, - set(['Input', 'W', 'InitH', 'InitC']), ['Out', 'LastH', 'LastC'], - max_relative_error=1e-4) + self.check_grad_with_place(place, + set(['Input', 'W', 'InitH', 'InitC']), + ['Out', 'LastH', 'LastC']) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestCUDNNLstmOp2(TestCUDNNLstmOp): + def set_attrs(self): + self.sequence_length = np.array([], dtype=np.int32) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestCUDNNLstmOp3(TestCUDNNLstmOp): + def set_attrs(self): + self.num_layers = 2 @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -198,7 +466,7 @@ class TestCUDNNlstmAPI(unittest.TestCase): 'float64', 0.0) rnn_out, last_h, last_c = layers.lstm(input, init_h, init_c, seq_len, hidden_size, num_layers, - dropout_prob) + dropout_prob, False, True) exe = 
fluid.Executor(fluid.CUDAPlace(0)) exe.run(fluid.default_startup_program()) input_i = np.random.uniform( @@ -208,12 +476,6 @@ class TestCUDNNlstmAPI(unittest.TestCase): feed={'input': input_i}, fetch_list=[rnn_out, last_h, last_c, 'cudnn_lstm_0.w_0']) - output, last_hidden, last_cell = lstm_naive(input_i, out[3]) - - self.assertTrue(np.allclose(output, out[0], atol=1e-5)) - self.assertTrue(np.allclose(last_hidden, out[1], atol=1e-5)) - self.assertTrue(np.allclose(last_cell, out[2], atol=1e-5)) - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py index 19da09a463f3cc6224a22eb90278abae9ec59b91..2feca1c30689cec20e1d696cc672516414786038 100755 --- a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py @@ -26,6 +26,8 @@ import paddle def nearest_neighbor_interp_np(X, out_h, out_w, + scale_h=0, + scale_w=0, out_size=None, actual_shape=None, align_corners=True, @@ -46,13 +48,18 @@ def nearest_neighbor_interp_np(X, if (align_corners): ratio_h = (in_h - 1.0) / (out_h - 1.0) else: - ratio_h = 1.0 * in_h / out_h + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h if (out_w > 1): if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w - + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((n, c, out_h, out_w)) if align_corners: @@ -89,7 +96,8 @@ class TestNearestInterpOp(OpTest): else: in_h = self.input_shape[1] in_w = self.input_shape[2] - + scale_h = 0 + scale_w = 0 if self.scale: if isinstance(self.scale, float) or isinstance(self.scale, int): if self.scale > 0: @@ -106,8 +114,8 @@ class TestNearestInterpOp(OpTest): out_w = self.out_w output_np = nearest_neighbor_interp_np( - input_np, out_h, out_w, self.out_size, self.actual_shape, - self.align_corners, self.data_layout) + input_np, out_h, out_w, scale_h, scale_w, self.out_size, + self.actual_shape, self.align_corners, self.data_layout) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -265,7 +273,7 @@ class TestNearestInterpOpUint8(OpTest): out_h = self.out_h out_w = self.out_w - output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, + output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape, self.align_corners) self.inputs = {'X': input_np} @@ -408,7 +416,7 @@ class TestNearestInterpOp_attr_tensor(OpTest): if isinstance(self.scale, list) and len(self.scale) == 1: self.scale = [self.scale[0], self.scale[0]] self.attrs['scale'] = self.scale - output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, + output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape, self.align_corners) self.outputs = {'Out': output_np} diff --git a/python/paddle/fluid/tests/unittests/test_normal.py b/python/paddle/fluid/tests/unittests/test_normal.py index 3e6855feaf491727203063f5c75c68301abbe05e..995a1f26ff6eb86c9198a164bcef80bebe3a8e79 100644 --- a/python/paddle/fluid/tests/unittests/test_normal.py +++ b/python/paddle/fluid/tests/unittests/test_normal.py @@ -18,6 +18,7 @@ import paddle import copy np.random.seed(10) +paddle.manual_seed(10) class TestNormalAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py 
b/python/paddle/fluid/tests/unittests/test_optimizer.py index 2e6e516aa2edde79e6524b4b35507ea95876ec53..91d705223316360b8c05954259724a5f7d246440 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -832,8 +832,8 @@ class TestRecomputeOptimizer(unittest.TestCase): recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) recompute_optimizer._set_checkpoints([b1_out]) try: - stat_dict = {} - recompute_optimizer.load(stat_dict) + state_dict = {} + recompute_optimizer.load(state_dict) except NotImplementedError as e: self.assertEqual( "load function is not supported by Recompute Optimizer for now", diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py index bac196b1ab52b604a85321a5473d455d2616bf0d..9cc507aa9b7918e854d56f1c8482f1b875910fb4 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py @@ -47,5 +47,21 @@ class TestParallelDygraphMnistSpawn(TestDistSpawnRunner): self.check_dist_result_with_spawn(test_class=TestMnist, delta=1e-5) +class TestFleetDygraphMnist(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._gpu_fleet_api = True + + def test_mnist(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_mnist.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py index 1c05b96f1fc61234028e940f6403ae08a0186027..25216175d59935535a352b02afc3c8f371cedd63 100644 --- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py @@ -143,6 +143,27 @@ class TestPool1d_API(unittest.TestCase): result = avg_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_avg_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = F.avg_pool1d( + input, + kernel_size=2, + stride=2, + padding=[1], + count_include_pad=True) + + result_np = avg_pool1D_forward_naive( + input_np, ksize=[2], strides=[2], paddings=[1], exclusive=False) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool1d_dg = paddle.nn.AvgPool1d( + kernel_size=2, stride=None, padding=1, count_include_pad=True) + result = avg_pool1d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_static_results(self, place): with fluid.program_guard(fluid.Program(), fluid.Program()): input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py index 93a2be6de342efc4e8284e7c352137d0a3a1bcb9..91faf78418b0d3a92a3cb6a167b6024b1beb3898 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from test_pool2d_op import adaptive_start_index, adaptive_end_index, pool2D_forward_naive +from test_pool2d_op import adaptive_start_index, adaptive_end_index, pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive import unittest from op_test import OpTest import numpy as np @@ -68,6 +68,47 @@ class TestPool2d_API(unittest.TestCase): result = avg_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_avg_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool2d( + input, kernel_size=2, stride=2, padding=1, ceil_mode=False) + + result_np = avg_pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[1, 1], + ceil_mode=False, + exclusive=False) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + kernel_size=2, stride=2, padding=1, ceil_mode=False) + result = avg_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_avg_dygraph_ceilmode_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool2d( + input, kernel_size=2, stride=2, padding=0, ceil_mode=True) + + result_np = avg_pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + ceil_mode=True) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + result = avg_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_static_results(self, place): with fluid.program_guard(fluid.Program(), fluid.Program()): input = fluid.data( @@ -108,6 +149,70 @@ class TestPool2d_API(unittest.TestCase): result = max_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_nhwc_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable( + np.transpose(input_np, [0, 2, 3, 1])) + result = max_pool2d( + input, + kernel_size=2, + stride=2, + padding=0, + return_indices=False, + data_format="NHWC") + + result_np = pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + pool_type='max') + self.assertTrue( + np.allclose( + np.transpose(result.numpy(), [0, 3, 1, 2]), result_np)) + + def check_max_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool2d( + input, kernel_size=2, stride=2, padding=1, ceil_mode=False) + + result_np = max_pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[1, 1], + ceil_mode=False, + exclusive=False) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool2d_dg = paddle.nn.layer.MaxPool2d( + kernel_size=2, stride=2, padding=1, ceil_mode=False) + result = max_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_max_dygraph_ceilmode_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool2d( + input, kernel_size=2, stride=2, 
padding=0, ceil_mode=True) + + result_np = max_pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + ceil_mode=True) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool2d_dg = paddle.nn.layer.MaxPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + result = max_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_stride_is_none(self, place): with fluid.dygraph.guard(place): input_np = np.random.random([2, 3, 32, 32]).astype("float32") @@ -215,6 +320,9 @@ class TestPool2d_API(unittest.TestCase): self.check_avg_dygraph_stride_is_none(place) self.check_max_dygraph_padding(place) self.check_avg_divisor(place) + self.check_max_dygraph_padding_results(place) + self.check_max_dygraph_ceilmode_results(place) + self.check_max_dygraph_nhwc_results(place) class TestPool2dError_API(unittest.TestCase): @@ -370,6 +478,22 @@ class TestPool2dError_API(unittest.TestCase): self.assertRaises(ValueError, run8) + def run9(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + res_pd = max_pool2d( + input_pd, + kernel_size=2, + stride=2, + padding=0, + ceil_mode=False, + data_format='NHWC', + return_indices=True) + + self.assertRaises(ValueError, run9) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py index cc078e9aae7aafe55e937b80270dd012fd64ff70..a77f1cdd57d7bade92e2a4f914dc3d91624d4845 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py @@ -22,7 +22,7 @@ import paddle.fluid.core as core from op_test import OpTest import paddle.fluid as fluid from paddle.nn.functional import avg_pool3d, max_pool3d -from test_pool3d_op import adaptive_start_index, adaptive_end_index, pool3D_forward_naive +from test_pool3d_op import adaptive_start_index, adaptive_end_index, pool3D_forward_naive, avg_pool3D_forward_naive, max_pool3D_forward_naive class TestPool3d_API(unittest.TestCase): @@ -73,6 +73,58 @@ class TestPool3d_API(unittest.TestCase): result = avg_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_avg_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool3d( + input, + kernel_size=2, + stride=2, + padding=1, + ceil_mode=False, + count_include_pad=True) + + result_np = avg_pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[1, 1, 1], + ceil_mode=False, + exclusive=False) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool3d_dg = paddle.nn.layer.AvgPool3d( + kernel_size=2, + stride=None, + padding=1, + ceil_mode=False, + count_include_pad=True) + result = avg_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_avg_dygraph_ceilmode_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool3d( + input, kernel_size=2, stride=2, padding=0, ceil_mode=True) + + result_np = avg_pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + ceil_mode=True) + + 
self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool3d_dg = paddle.nn.layer.AvgPool3d( + kernel_size=2, stride=None, padding=0, ceil_mode=True) + result = avg_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_static_results(self, place): with fluid.program_guard(fluid.Program(), fluid.Program()): input = fluid.data( @@ -112,6 +164,74 @@ class TestPool3d_API(unittest.TestCase): result = max_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_ndhwc_results(self, place): + print("run ndhwc max pool3d") + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable( + np.transpose(input_np, [0, 2, 3, 4, 1])) + result = max_pool3d( + input, + kernel_size=2, + stride=2, + padding=0, + data_format="NDHWC", + return_indices=False) + + result_np = pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + pool_type='max') + + self.assertTrue( + np.allclose( + np.transpose(result.numpy(), [0, 4, 1, 2, 3]), result_np)) + + def check_max_dygraph_ceilmode_results(self, place): + print("run ceil mode max pool3d") + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool3d( + input, kernel_size=2, stride=2, padding=0, ceil_mode=True) + + result_np = max_pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + ceil_mode=True) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool3d_dg = paddle.nn.layer.MaxPool3d( + kernel_size=2, stride=None, padding=0, ceil_mode=True) + result = max_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_max_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool3d( + input, kernel_size=2, stride=2, padding=1, ceil_mode=False) + + result_np = max_pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[1, 1, 1], + ceil_mode=False) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool3d_dg = paddle.nn.layer.MaxPool3d( + kernel_size=2, stride=None, padding=1, ceil_mode=False) + result = max_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_stride_is_none(self, place): with fluid.dygraph.guard(place): input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") @@ -205,6 +325,8 @@ class TestPool3d_API(unittest.TestCase): self.check_max_dygraph_stride_is_none(place) self.check_max_dygraph_padding(place) self.check_avg_divisor(place) + self.check_max_dygraph_ndhwc_results(place) + self.check_max_dygraph_ceilmode_results(place) class TestPool3dError_API(unittest.TestCase): @@ -336,6 +458,21 @@ class TestPool3dError_API(unittest.TestCase): self.assertRaises(ValueError, run9) + def run10(): + with fluid.dygraph.guard(): + input_np = np.random.uniform( + -1, 1, [2, 3, 32, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + res_pd = max_pool3d( + input_pd, + kernel_size=2, + stride=2, + padding=0, + data_format='NDHWC', + return_indices=True) + + self.assertRaises(ValueError, run10) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_rand_op.py 
b/python/paddle/fluid/tests/unittests/test_rand_op.py index 1eceeaadfec651ade5031ddc7e6a012244050e84..4b8fe8c7e4786417de2f80dbb9953530781f9189 100644 --- a/python/paddle/fluid/tests/unittests/test_rand_op.py +++ b/python/paddle/fluid/tests/unittests/test_rand_op.py @@ -120,24 +120,24 @@ class TestRandDtype(unittest.TestCase): def test_default_dtype(self): paddle.disable_static() - def test_default_fp_16(): + def test_default_fp16(): paddle.framework.set_default_dtype('float16') paddle.tensor.random.rand([2, 3]) - self.assertRaises(TypeError, test_default_fp_16) + self.assertRaises(TypeError, test_default_fp16) - def test_default_fp_32(): + def test_default_fp32(): paddle.framework.set_default_dtype('float32') out = paddle.tensor.random.rand([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) - def test_default_fp_64(): + def test_default_fp64(): paddle.framework.set_default_dtype('float64') out = paddle.tensor.random.rand([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) - test_default_fp_64() - test_default_fp_32() + test_default_fp64() + test_default_fp32() paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py index 88b07f5df83f8f967f8ba76e78b37ecfb2c54276..7880b48cd7d5a006d78b836be3d9d2f0b1e04c5e 100644 --- a/python/paddle/fluid/tests/unittests/test_randint_op.py +++ b/python/paddle/fluid/tests/unittests/test_randint_op.py @@ -58,6 +58,11 @@ class TestRandintOpError(unittest.TestCase): self.assertRaises(TypeError, paddle.randint, 5, dtype='float32') self.assertRaises(ValueError, paddle.randint, 5, 5) self.assertRaises(ValueError, paddle.randint, -5) + self.assertRaises(TypeError, paddle.randint, 5, shape=['2']) + shape_tensor = paddle.static.data('X', [1]) + self.assertRaises(TypeError, paddle.randint, 5, shape=shape_tensor) + self.assertRaises( + TypeError, paddle.randint, 5, shape=[shape_tensor]) class TestRandintOp_attr_tensorlist(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index cf35f9dbcdaaae1357ccdfd6b5cba85ac98d2037..b0b85f633a2bf613cdbdcc2ba7b31b5d970da8ca 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -475,87 +475,71 @@ class API_TestSumOpError(unittest.TestCase): def test_errors(self): def test_dtype1(): with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="float32") - paddle.sum(data, dtype="int32") + data = fluid.data(name="data", shape=[10], dtype="float64") + paddle.sum(data, dtype="float32") self.assertRaises(ValueError, test_dtype1) def test_dtype2(): with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="float32") - paddle.sum(data, dtype="float32") + data = fluid.data(name="data", shape=[10], dtype="int64") + paddle.sum(data, dtype="int32") self.assertRaises(ValueError, test_dtype2) def test_dtype3(): with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="int32") - paddle.sum(data, dtype="bool") + data = fluid.data(name="data", shape=[10], dtype="float64") + paddle.sum(data, dtype="int32") self.assertRaises(ValueError, test_dtype3) - def test_dtype4(): + def test_type(): with fluid.program_guard(fluid.Program(), fluid.Program()): data = fluid.data(name="data", shape=[10], dtype="int32") - paddle.sum(data, 
dtype="int32") + paddle.sum(data, dtype="bool") - self.assertRaises(ValueError, test_dtype3) + self.assertRaises(TypeError, test_type) class API_TestSumOp(unittest.TestCase): - def test_static(self): - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data("data", shape=[10, 10], dtype="float32") - result_sum = paddle.sum(x=data, axis=1, dtype="float64") - place = fluid.CPUPlace() - exe = fluid.Executor(place) - input_data = np.random.rand(10, 10).astype(np.float32) - res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum]) - self.assertEqual( - (res == np.sum(input_data.astype(np.float64), axis=1)).all(), True) + def run_static(self, + shape, + x_dtype, + attr_axis, + attr_dtype=None, + np_axis=None): + if np_axis is None: + np_axis = attr_axis with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data("data", shape=[10, 10], dtype="int32") - result_sum = paddle.sum(x=data, axis=1, dtype="int64") - place = fluid.CPUPlace() - exe = fluid.Executor(place) - input_data = np.random.randint(10, size=(10, 10)).astype(np.int32) - res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum]) - self.assertEqual( - (res == np.sum(input_data.astype(np.int64), axis=1)).all(), True) + data = fluid.data("data", shape=shape, dtype=x_dtype) + result_sum = paddle.sum(x=data, axis=attr_axis, dtype=attr_dtype) - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data("data", shape=[10, 10], dtype="int32") - result_sum = paddle.sum(x=data, axis=1) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - input_data = np.random.randint(10, size=(10, 10)).astype(np.int32) + exe = fluid.Executor(fluid.CPUPlace()) + input_data = np.random.rand(*shape).astype(x_dtype) res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum]) - self.assertEqual((res == np.sum(input_data, axis=1)).all(), True) - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data("data", shape=[10, 10], dtype="int32") - result_sum = paddle.sum(x=data, axis=1) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - input_data = np.random.randint(10, size=(10, 10)).astype(np.int32) - res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum]) - self.assertEqual((res == np.sum(input_data, axis=1)).all(), True) + self.assertTrue( + np.allclose( + res, np.sum(input_data.astype(attr_dtype), axis=np_axis))) - with fluid.program_guard(fluid.Program(), fluid.Program()): - input_data = np.random.randint(10, size=(5, 5, 5)).astype(np.int32) - data = fluid.data("data", shape=[5, 5, 5], dtype="int32") - sum1 = paddle.sum(x=data, axis=[0, 1]) - sum2 = paddle.sum(x=data, axis=()) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - res1, res2 = exe.run(feed={"data": input_data}, - fetch_list=[sum1, sum2]) - - self.assertEqual((res1 == np.sum(input_data, axis=(0, 1))).all(), True) - self.assertEqual( - (res2 == np.sum(input_data, axis=(0, 1, 2))).all(), True) + def test_static(self): + shape = [10, 10] + axis = 1 + + self.run_static(shape, "int32", axis, attr_dtype=None) + self.run_static(shape, "int32", axis, attr_dtype="int32") + self.run_static(shape, "int32", axis, attr_dtype="int64") + + self.run_static(shape, "float32", axis, attr_dtype=None) + self.run_static(shape, "float32", axis, attr_dtype="float32") + self.run_static(shape, "float32", axis, attr_dtype="float64") + + shape = [5, 5, 5] + self.run_static(shape, "int32", (0, 1), attr_dtype="int32") + self.run_static( + shape, "int32", (), attr_dtype="int32", 
np_axis=(0, 1, 2)) def test_dygraph(self): np_x = np.random.random([2, 3, 4]).astype('int32') diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index 44087c5421a5ee66273ef35b935926d42dcc37ae..167a8a017c24a01a6475a03835222d33c601396e 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -106,9 +106,9 @@ def bow_net(data, label, dict_dim, is_sparse=False, - emb_dim=128, - hid_dim=128, - hid_dim2=96, + emb_dim=8, + hid_dim=8, + hid_dim2=6, class_dim=2): """ BOW net @@ -132,8 +132,8 @@ class TestRegularizer(unittest.TestCase): def setUp(self): self.word_dict = paddle.dataset.imdb.word_dict() reader = paddle.batch( - paddle.dataset.imdb.train(self.word_dict), batch_size=8)() - self.train_data = [next(reader) for _ in range(5)] + paddle.dataset.imdb.train(self.word_dict), batch_size=1)() + self.train_data = [next(reader) for _ in range(1)] def get_places(self): places = [core.CPUPlace()] @@ -245,14 +245,14 @@ class TestRegularizer(unittest.TestCase): sgd.minimize(loss) with fluid.dygraph.guard(): input = fluid.dygraph.to_variable( - np.random.randn(3, 5).astype('float32')) + np.random.randn(3, 2).astype('float32')) paddle.manual_seed(1) paddle.framework.random._manual_program_seed(1) linear1 = fluid.dygraph.Linear( - 5, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) + 2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) linear2 = fluid.dygraph.Linear( - 5, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) + 2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) loss1 = linear1(input) loss1.backward() diff --git a/python/paddle/fluid/tests/unittests/test_stack_op.py b/python/paddle/fluid/tests/unittests/test_stack_op.py index fd5c02c55db4c22d9edd604b7998a5405961d596..8dd71c5a558094ce6f259105eeb1aafb834ad6dc 100644 --- a/python/paddle/fluid/tests/unittests/test_stack_op.py +++ b/python/paddle/fluid/tests/unittests/test_stack_op.py @@ -182,6 +182,11 @@ class API_test(unittest.TestCase): expected_result = np.stack([input1, input2, input3], axis=0) self.assertTrue(np.allclose(expected_result, result)) + def test_single_tensor_error(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = paddle.rand([2, 3]) + self.assertRaises(TypeError, paddle.stack, x) + class API_DygraphTest(unittest.TestCase): def test_out(self): @@ -192,18 +197,23 @@ class API_DygraphTest(unittest.TestCase): x1 = fluid.dygraph.to_variable(data1) x2 = fluid.dygraph.to_variable(data2) x3 = fluid.dygraph.to_variable(data3) - result = paddle.stack([x1, x2, x3], axis=0) + result = paddle.stack([x1, x2, x3]) result_np = result.numpy() - expected_result = np.stack([data1, data2, data3], axis=0) + expected_result = np.stack([data1, data2, data3]) self.assertTrue(np.allclose(expected_result, result_np)) with fluid.dygraph.guard(): y1 = fluid.dygraph.to_variable(data1) - result = paddle.stack(y1, axis=0) + result = paddle.stack([y1], axis=0) result_np_2 = result.numpy() - expected_result_2 = np.stack(data1, axis=0) + expected_result_2 = np.stack([data1], axis=0) self.assertTrue(np.allclose(expected_result_2, result_np_2)) + def test_single_tensor_error(self): + with fluid.dygraph.guard(): + x = paddle.to_tensor([1, 2, 3]) + self.assertRaises(Exception, paddle.stack, x) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py 
b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index 09cd40d9cc59914c82cc343bb78b72fbc2b29e59..1c11e831b0ad31a3c450c70e7f7c258455409d05 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -227,14 +227,15 @@ class TestConvertSyncBatchNorm(unittest.TestCase): return with program_guard(Program(), Program()): + compare_model = paddle.nn.Sequential( + paddle.nn.Conv2d(3, 5, 3), paddle.nn.BatchNorm2d(5)) model = paddle.nn.Sequential( paddle.nn.Conv2d(3, 5, 3), paddle.nn.BatchNorm2d(5)) - sync_model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) - for idx, sublayer in enumerate(model.sublayers()): + model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) + for idx, sublayer in enumerate(compare_model.sublayers()): if isinstance(sublayer, paddle.nn.BatchNorm2d): self.assertEqual( - isinstance(sync_model[idx], paddle.nn.SyncBatchNorm), - True) + isinstance(model[idx], paddle.nn.SyncBatchNorm), True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_translated_layer.py b/python/paddle/fluid/tests/unittests/test_translated_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..20c51b9afbafac9ba1fa032aea446383bc2b9796 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_translated_layer.py @@ -0,0 +1,157 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.nn as nn +import paddle.optimizer as opt + +BATCH_SIZE = 16 +BATCH_NUM = 4 +EPOCH_NUM = 4 +SEED = 10 + +IMAGE_SIZE = 784 +CLASS_NUM = 10 + + +# define a random dataset +class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + np.random.seed(SEED) + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + +class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) + + @paddle.jit.to_static + def forward(self, x): + return self._linear(x) + + +def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format(epoch_id, batch_id, + np.mean(loss.numpy()))) + return loss + + +class TestTranslatedLayer(unittest.TestCase): + def setUp(self): + # enable dygraph mode + place = paddle.CPUPlace() + paddle.disable_static(place) + + # config seed + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + # create network + self.layer = LinearNet() + self.loss_fn = nn.CrossEntropyLoss() + self.sgd = opt.SGD(learning_rate=0.001, + parameters=self.layer.parameters()) + + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + self.loader = paddle.io.DataLoader( + dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + + # train + train(self.layer, self.loader, self.loss_fn, self.sgd) + + # save + self.model_path = "linear.example.model" + paddle.jit.save(self.layer, self.model_path) + + def test_inference_and_fine_tuning(self): + self.load_and_inference() + self.load_and_fine_tuning() + + def load_and_inference(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + # inference + x = paddle.randn([1, IMAGE_SIZE], 'float32') + + self.layer.eval() + orig_pred = self.layer(x) + + translated_layer.eval() + pred = translated_layer(x) + + self.assertTrue(np.array_equal(orig_pred.numpy(), pred.numpy())) + + def load_and_fine_tuning(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + # train original layer continue + self.layer.train() + orig_loss = train(self.layer, self.loader, self.loss_fn, self.sgd) + + # fine-tuning + translated_layer.train() + sgd = opt.SGD(learning_rate=0.001, + parameters=translated_layer.parameters()) + loss = train(translated_layer, self.loader, self.loss_fn, sgd) + + self.assertTrue( + np.array_equal(orig_loss.numpy(), loss.numpy()), + msg="original loss:\n{}\nnew loss:\n{}\n".format(orig_loss.numpy(), + loss.numpy())) + + def test_get_program(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + program = translated_layer.program() + self.assertTrue(isinstance(program, paddle.static.Program)) + + def test_get_program_method_not_exists(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + with self.assertRaises(ValueError): + program = translated_layer.program('not_exists') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py 
b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py index 49924b44441aa9ae323f0d7921d71bf58b8c2cf2..245c2623b869af30acfb5d0379c7597813645031 100755 --- a/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py @@ -26,6 +26,9 @@ def trilinear_interp_np(input, out_d, out_h, out_w, + scale_d=0, + scale_h=0, + scale_w=0, out_size=None, actual_shape=None, align_corners=True, @@ -49,17 +52,26 @@ def trilinear_interp_np(input, if (align_corners): ratio_d = (in_d - 1.0) / (out_d - 1.0) else: - ratio_d = 1.0 * in_d / out_d + if scale_d > 0: + ratio_d = 1.0 / scale_d + else: + ratio_d = 1.0 * in_d / out_d if out_h > 1: if (align_corners): ratio_h = (in_h - 1.0) / (out_h - 1.0) else: - ratio_h = 1.0 * in_h / out_h + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h if out_w > 1: if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((batch_size, channel, out_d, out_h, out_w)) @@ -133,6 +145,9 @@ class TestTrilinearInterpOp(OpTest): self.op_type = "trilinear_interp_v2" input_np = np.random.random(self.input_shape).astype("float32") + scale_w = 0 + scale_h = 0 + scale_d = 0 if self.data_layout == "NCDHW": in_d = self.input_shape[2] in_h = self.input_shape[3] @@ -159,9 +174,10 @@ class TestTrilinearInterpOp(OpTest): out_h = self.out_h out_w = self.out_w - output_np = trilinear_interp_np( - input_np, out_d, out_h, out_w, self.out_size, self.actual_shape, - self.align_corners, self.align_mode, self.data_layout) + output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, scale_d, + scale_h, scale_w, self.out_size, + self.actual_shape, self.align_corners, + self.align_mode, self.data_layout) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -359,7 +375,7 @@ class TestTrilinearInterpOpUint8(OpTest): out_h = self.out_h out_w = self.out_w - output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, + output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, 0, 0, 0, self.out_size, self.actual_shape, self.align_corners, self.align_mode) self.inputs = {'X': input_np} @@ -482,7 +498,7 @@ class TestTrilinearInterpZero(TestTrilinearInterpOp): self.out_d = 60 self.out_h = 40 self.out_w = 25 - self.scale = 0.2 + self.scale = 0.0 self.align_corners = False self.align_mode = 0 @@ -541,7 +557,7 @@ class TestTrilinearInterpOp_attr_tensor(OpTest): if isinstance(self.scale, list) and len(self.scale) == 1: self.scale = [self.scale[0], self.scale[0], self.scale[0]] self.attrs['scale'] = self.scale - output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, + output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, 0, 0, 0, self.out_size, self.actual_shape, self.align_corners, self.align_mode) self.outputs = {'Out': output_np} diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 56dc27a9a5b136829ce410b50998e23b77510665..5ecf25c53b794f07e298b986eff5700698b8bff7 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -239,12 +239,12 @@ class TestUniformRandomOpSelectedRows(unittest.TestCase): op = Operator( "uniform_random", Out="X", - shape=[100, 784], + shape=[1000, 784], min=-5.0, max=10.0, seed=10) 
op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [100, 784]) + self.assertEqual(out.get_tensor().shape(), [1000, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -260,15 +260,15 @@ class TestUniformRandomOpSelectedRowsWithDiagInit( op = Operator( "uniform_random", Out="X", - shape=[100, 784], + shape=[500, 784], min=-5.0, max=10.0, seed=10, - diag_num=100, + diag_num=500, diag_step=784, diag_val=1.0) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [100, 784]) + self.assertEqual(out.get_tensor().shape(), [500, 784]) hist, prob = output_hist_diag(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -391,7 +391,7 @@ class TestUniformRandomOpSelectedRowsShapeTensor(unittest.TestCase): scope = core.Scope() out = scope.var("X").get_selected_rows() shape_tensor = scope.var("Shape").get_tensor() - shape_tensor.set(np.array([100, 784]).astype("int64"), place) + shape_tensor.set(np.array([1000, 784]).astype("int64"), place) paddle.manual_seed(10) op = Operator( "uniform_random", @@ -401,7 +401,7 @@ class TestUniformRandomOpSelectedRowsShapeTensor(unittest.TestCase): max=10.0, seed=10) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [100, 784]) + self.assertEqual(out.get_tensor().shape(), [1000, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -423,7 +423,7 @@ class TestUniformRandomOpSelectedRowsShapeTensorList(unittest.TestCase): scope = core.Scope() out = scope.var("X").get_selected_rows() shape_1 = scope.var("shape1").get_tensor() - shape_1.set(np.array([100]).astype("int64"), place) + shape_1.set(np.array([1000]).astype("int64"), place) shape_2 = scope.var("shape2").get_tensor() shape_2.set(np.array([784]).astype("int64"), place) paddle.manual_seed(10) @@ -435,7 +435,7 @@ class TestUniformRandomOpSelectedRowsShapeTensorList(unittest.TestCase): max=10.0, seed=10) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [100, 784]) + self.assertEqual(out.get_tensor().shape(), [1000, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -540,24 +540,24 @@ class TestUniformDtype(unittest.TestCase): def test_default_dtype(self): paddle.disable_static() - def test_default_fp_16(): + def test_default_fp16(): paddle.framework.set_default_dtype('float16') paddle.tensor.random.uniform([2, 3]) - self.assertRaises(TypeError, test_default_fp_16) + self.assertRaises(TypeError, test_default_fp16) - def test_default_fp_32(): + def test_default_fp32(): paddle.framework.set_default_dtype('float32') out = paddle.tensor.random.uniform([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) - def test_default_fp_64(): + def test_default_fp64(): paddle.framework.set_default_dtype('float64') out = paddle.tensor.random.uniform([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) - test_default_fp_64() - test_default_fp_32() + test_default_fp64() + test_default_fp32() paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py index 9382d53e7fec6ba9e1217f99ba5006b3dfe5c150..6f713172f1b29d0df8eed212ab1b148d00d7d45e 100644 --- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py @@ -134,29 +134,60 @@ class API_TestUnsqueeze3(unittest.TestCase): result1, = exe.run(feed={"data1": input, "data2": input2}, fetch_list=[result_squeeze]) - 
self.assertTrue(np.allclose(input1, result1)) + self.assertTrue(np.array_equal(input1, result1)) + self.assertEqual(input1.shape, result1.shape) class API_TestDyUnsqueeze(unittest.TestCase): def test_out(self): with fluid.dygraph.guard(): input_1 = np.random.random([5, 1, 10]).astype("int32") - input1 = np.squeeze(input_1, axis=1) + input1 = np.expand_dims(input_1, axis=1) input = fluid.dygraph.to_variable(input_1) output = paddle.unsqueeze(input, axis=[1]) out_np = output.numpy() - self.assertTrue(np.allclose(input1, out_np)) + self.assertTrue(np.array_equal(input1, out_np)) + self.assertEqual(input1.shape, out_np.shape) class API_TestDyUnsqueeze2(unittest.TestCase): def test_out(self): with fluid.dygraph.guard(): - input_1 = np.random.random([5, 1, 10]).astype("int32") - input1 = np.squeeze(input_1, axis=1) - input = fluid.dygraph.to_variable(input_1) + input1 = np.random.random([5, 10]).astype("int32") + out1 = np.expand_dims(input1, axis=1) + input = fluid.dygraph.to_variable(input1) output = paddle.unsqueeze(input, axis=1) out_np = output.numpy() - self.assertTrue(np.allclose(input1, out_np)) + self.assertTrue(np.array_equal(out1, out_np)) + self.assertEqual(out1.shape, out_np.shape) + + +class API_TestDyUnsqueezeAxisTensor(unittest.TestCase): + def test_out(self): + with fluid.dygraph.guard(): + input1 = np.random.random([5, 10]).astype("int32") + out1 = np.expand_dims(input1, axis=1) + input = fluid.dygraph.to_variable(input1) + output = paddle.unsqueeze(input, axis=paddle.to_tensor([1])) + out_np = output.numpy() + self.assertTrue(np.array_equal(out1, out_np)) + self.assertEqual(out1.shape, out_np.shape) + + +class API_TestDyUnsqueezeAxisTensorList(unittest.TestCase): + def test_out(self): + with fluid.dygraph.guard(): + input1 = np.random.random([5, 10]).astype("int32") + # Actually, expand_dims supports tuple since version 1.18.0 + out1 = np.expand_dims(input1, axis=1) + out1 = np.expand_dims(out1, axis=2) + input = fluid.dygraph.to_variable(input1) + output = paddle.unsqueeze( + fluid.dygraph.to_variable(input1), + axis=[paddle.to_tensor([1]), paddle.to_tensor([2])]) + out_np = output.numpy() + self.assertTrue(np.array_equal(out1, out_np)) + self.assertEqual(out1.shape, out_np.shape) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index c8383bb950d3ed7b2cdfafa185b0ad156bf7c7bf..deb49a3ffc2b5febf97680bc652e9695fb253373 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -33,16 +33,28 @@ class TestVarBase(unittest.TestCase): def _test_place(place): with fluid.dygraph.guard(): paddle.set_default_dtype('float32') + # set_default_dtype should not take effect on int x = paddle.to_tensor(1, place=place, stop_gradient=False) self.assertTrue(np.array_equal(x.numpy(), [1])) self.assertNotEqual(x.dtype, core.VarDesc.VarType.FP32) + # set_default_dtype should not take effect on numpy + x = paddle.to_tensor( + np.array([1.2]).astype('float16'), + place=place, + stop_gradient=False) + self.assertTrue( + np.array_equal(x.numpy(), np.array([1.2], 'float16'))) + self.assertEqual(x.dtype, core.VarDesc.VarType.FP16) + + # set_default_dtype take effect on float x = paddle.to_tensor(1.2, place=place, stop_gradient=False) self.assertTrue( np.array_equal(x.numpy(), np.array([1.2]).astype( 'float32'))) self.assertEqual(x.dtype, core.VarDesc.VarType.FP32) + # set_default_dtype take effect on complex x = paddle.to_tensor(1 + 2j, 
place=place, stop_gradient=False) self.assertTrue(np.array_equal(x.numpy(), [1 + 2j])) self.assertEqual(x.dtype, 'complex64') diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index af788874191335ad31d1540bcc0db90cc12383c6..f33e4e0fca8727574bcd1970e26c6eaee2139a05 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -20,8 +20,8 @@ __all__ = [ ] __all__ += [ - 'grad', 'LayerList', 'load', 'save', 'to_variable', 'no_grad', - 'DataParallel' + 'grad', 'LayerList', 'load', 'save', 'SaveLoadConfig', 'to_variable', + 'no_grad', 'DataParallel' ] __all__ += [ @@ -50,6 +50,7 @@ from ..fluid.dygraph.base import to_variable #DEFINE_ALIAS from ..fluid.dygraph.base import grad #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import load_dygraph as load #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import save_dygraph as save #DEFINE_ALIAS +from ..fluid.dygraph.jit import SaveLoadConfig #DEFINE_ALIAS from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py index 2555d24464112ed8446d863dc8e65cfa37680b36..ba2cf603d4a69f118320e40f1f953cb4c5fcfb39 100644 --- a/python/paddle/framework/random.py +++ b/python/paddle/framework/random.py @@ -16,7 +16,7 @@ import paddle.fluid as fluid from paddle.fluid import core -__all__ = ['manual_seed'] +__all__ = ['manual_seed', 'get_cuda_rng_state', 'set_cuda_rng_state'] def manual_seed(seed): @@ -42,10 +42,69 @@ seed = int(seed) + if core.is_compiled_with_cuda(): + for i in range(core.get_cuda_device_count()): + core.default_cuda_generator(i)._is_init_py = True + core.default_cuda_generator(i).manual_seed(seed) + core.default_cpu_generator()._is_init_py = True return core.default_cpu_generator().manual_seed(seed) +def get_cuda_rng_state(): + """ + + Get the random state of cuda generators. + + Args: + None + + Returns: + list: a list of GeneratorState objects, one for each cuda device. + + Examples: + .. code-block:: python + + import paddle + sts = paddle.get_cuda_rng_state() + + """ + state_list = [] + if core.is_compiled_with_cuda(): + for i in range(core.get_cuda_device_count()): + state_list.append(core.default_cuda_generator(i).get_state()) + + return state_list + + +def set_cuda_rng_state(state_list): + """ + + Sets the generator state for all cuda generators. + + Args: + state_list(list): The cuda states to set back to cuda generators. state_list is obtained from get_cuda_rng_state(). + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + sts = paddle.get_cuda_rng_state() + paddle.set_cuda_rng_state(sts) + + """ + if core.is_compiled_with_cuda(): + if len(state_list) != core.get_cuda_device_count(): + raise ValueError( + "Length of cuda state list should be equal to the cuda device count" + ) + for i in range(core.get_cuda_device_count()): + core.default_cuda_generator(i).set_state(state_list[i]) + + def _manual_program_seed(seed): """ Sets global seed for generating random numbers. diff --git a/python/paddle/hapi/__init__.py b/python/paddle/hapi/__init__.py index 87f5a82525cdfa36e48d40c6d12488d359fe99db..67965de5d97621e188acfa1e0384325b9ec5b7aa 100644 --- a/python/paddle/hapi/__init__.py +++ b/python/paddle/hapi/__init__.py @@ -14,14 +14,12 @@ from . import logger from . import callbacks +from . import model_summary from .
import model from .model import * - -from .dygraph_layer_patch import monkey_patch_layer +from .model_summary import summary logger.setup_logger() -__all__ = ['callbacks'] + model.__all__ - -monkey_patch_layer() +__all__ = ['callbacks'] + model.__all__ + ['summary'] diff --git a/python/paddle/hapi/dygraph_layer_patch.py b/python/paddle/hapi/dygraph_layer_patch.py deleted file mode 100644 index e3a2948b69305fcb08c14c850f5738ac46aea2be..0000000000000000000000000000000000000000 --- a/python/paddle/hapi/dygraph_layer_patch.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -import paddle.fluid as fluid -from paddle.fluid.framework import in_dygraph_mode -from paddle.fluid.framework import _current_expected_place as _get_device - - -def monkey_patch_layer(): - def load_dict(self, - stat_dict, - include_sublayers=True, - use_structured_name=True): - ''' - Set parameters from stat_dict. All the parameters will be reset by the - tensor in the stat_dict - - This api will be Deprecated. Please use set_dict - - Parameters: - state_dict(dict) : Dict contains all the parameters - include_sublayers(bool, optional) : If true, also include the - parameters from sublayers. Default: True - use_structured_name(bool, optional) : If true, use structured name - as key, otherwise, use parameter name as key. Default: True - Returns: - None - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) - - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") - - para_state_dict, _ = fluid.load_dygraph( "paddle_dy") - emb.load_dict( para_state_dict ) - - ''' - - def _check_match(key, param): - state = stat_dict.get(key, None) - if state is None: - raise ValueError( - "{} is not found in the providing file.".format(key)) - if list(state.shape) != list(param.shape): - raise ValueError( - "{} receives a shape {}, but the expected shape is {}.". - format(key, list(state.shape), list(param.shape))) - return param, state - - matched_param_state = [] - for key, param in self.state_dict().items(): - key_name = key if use_structured_name else param.name - try: - match_res = _check_match(key_name, param) - matched_param_state.append(match_res) - except ValueError as err: - warnings.warn(("Skip loading for {}. 
".format(key) + str(err))) - - if in_dygraph_mode(): - for param, state in matched_param_state: - param.set_value(state) - else: - - def _set_var(var, ndarray): - t = fluid.global_scope().find_var(var.name).get_tensor() - p = t._place() - if p.is_cpu_place(): - place = fluid.CPUPlace() - elif p.is_cuda_pinned_place(): - place = fluid.CUDAPinnedPlace() - else: - p = fluid.core.Place() - p.set_place(t._place()) - place = fluid.CUDAPlace(p.gpu_device_id()) - t.set(ndarray, place) - - executor = fluid.Executor(_get_device())._default_executor - # restore parameter states - fluid.core._create_loaded_parameter( - [param for param, state in matched_param_state], - fluid.global_scope(), executor) - for param, state in matched_param_state: - _set_var(param, state) - - setattr(fluid.dygraph.Layer, 'load_dict', load_dict) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index bba94d56cca8e77735d8921d007248b2e388a5f6..2836a151ec35698a31f3814d573828853349a151 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -47,10 +47,10 @@ from paddle.io import DataLoader, Dataset, DistributedBatchSampler from paddle.fluid.executor import scope_guard, Executor from paddle.fluid.dygraph.layers import Layer from paddle.metric import Metric - from paddle.static import InputSpec as Input from .callbacks import config_callbacks +from .model_summary import summary __all__ = ['Model', ] @@ -1828,6 +1828,54 @@ class Model(object): return logs, outputs return logs + def summary(self, input_size=None, batch_size=None, dtype=None): + """Prints a string summary of the network. + + Args: + input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. + if not set, input_size will get from ``self._inputs`` if network only have + one input, input_size can be tuple or InputSpec. if model have multiple + input, input_size must be a list which contain every input's shape. + Default: None. + batch_size (int, optional): batch size of input tensor, Default: None. + dtypes (str, optional): if dtypes is None, 'float32' will be used, Default: None. + + Returns: + Dict: a summary of the network including total params and total trainable params. + + Examples: + .. code-block:: python + + import paddle + from paddle.static import InputSpec + + dynamic = True + device = paddle.set_device('cpu') + paddle.disable_static(device) if dynamic else None + + input = InputSpec([None, 1, 28, 28], 'float32', 'image') + label = InputSpec([None, 1], 'int64', 'label') + + model = paddle.Model(paddle.vision.LeNet(classifier_activation=None), + input, label) + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + model.prepare( + optim, + paddle.nn.CrossEntropyLoss()) + + params_info = model.summary() + print(params_info) + + """ + assert (input_size is not None or self._inputs is not None + ), "'input_size' or 'self._input' must be set" + if input_size is not None: + _input_size = input_size + else: + _input_size = self._inputs + return summary(self.network, _input_size, batch_size, dtype) + def _verify_spec(self, specs, is_input=False): out_specs = [] diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py new file mode 100644 index 0000000000000000000000000000000000000000..716be1b539809ea3f90885b512f51ac45d85cd37 --- /dev/null +++ b/python/paddle/hapi/model_summary.py @@ -0,0 +1,229 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +from paddle.static import InputSpec + +from collections import OrderedDict + +__all__ = ['summary'] + + +def summary(net, input_size, batch_size=None, dtypes=None): + """Prints a string summary of the network. + + Args: + net (Layer): the network, which must be an instance of a Layer subclass. + input_size (tuple|InputSpec|list[tuple|InputSpec]): size of the input tensor. If the model + has a single input, input_size can be a tuple or an InputSpec. If the model + has multiple inputs, input_size must be a list containing + every input's shape. + batch_size (int, optional): batch size of the input tensor. Default: None. + dtypes (str, optional): if dtypes is None, 'float32' will be used. Default: None. + + Returns: + Dict: a summary of the network including total params and total trainable params. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + + class LeNet(nn.Layer): + def __init__(self, num_classes=10): + super(LeNet, self).__init__() + self.num_classes = num_classes + self.features = nn.Sequential( + nn.Conv2d( + 1, 6, 3, stride=1, padding=1), + nn.ReLU(), + nn.MaxPool2d(2, 2), + nn.Conv2d( + 6, 16, 5, stride=1, padding=0), + nn.ReLU(), + nn.MaxPool2d(2, 2)) + + if num_classes > 0: + self.fc = nn.Sequential( + nn.Linear(400, 120), + nn.Linear(120, 84), + nn.Linear( + 84, 10)) + + def forward(self, inputs): + x = self.features(inputs) + + if self.num_classes > 0: + x = paddle.flatten(x, 1) + x = self.fc(x) + return x + + lenet = LeNet() + + params_info = paddle.summary(lenet, (1, 28, 28)) + print(params_info) + + """ + if isinstance(input_size, InputSpec): + _input_size = tuple(input_size.shape[1:]) + if batch_size is None: + batch_size = input_size.shape[0] + elif isinstance(input_size, list): + _input_size = [] + for item in input_size: + if isinstance(item, int): + item = (item, ) + assert isinstance(item, + (tuple, InputSpec)), 'When input_size is list, \ + expect each item in input_size to be a tuple or InputSpec, but got {}'.format( + type(item)) + + if isinstance(item, InputSpec): + _input_size.append(tuple(item.shape[1:])) + if batch_size is None: + batch_size = item.shape[0] + else: + _input_size.append(item) + elif isinstance(input_size, int): + _input_size = (input_size, ) + else: + _input_size = input_size + + if batch_size is None: + batch_size = -1 + + result, params_info = summary_string(net, _input_size, batch_size, dtypes) + print(result) + + return params_info + + +def summary_string(model, input_size, batch_size=-1, dtypes=None): + if dtypes is None: + dtypes = ['float32'] * len(input_size) + + summary_str = '' + + depth = len(list(model.sublayers())) + + def register_hook(module): + def hook(module, input, output): + class_name = str(module.__class__).split(".")[-1].split("'")[0] + + try: + module_idx = int(module._full_name.split('_')[-1]) + except: + module_idx = len(summary) + + m_key = "%s-%i" % (class_name, module_idx + 1) + summary[m_key] = OrderedDict() +
summary[m_key]["input_shape"] = list(input[0].shape) + summary[m_key]["input_shape"][0] = batch_size + if isinstance(output, (list, tuple)): + summary[m_key]["output_shape"] = [[-1] + list(o.shape)[1:] + for o in output] + else: + summary[m_key]["output_shape"] = list(output.shape) + summary[m_key]["output_shape"][0] = batch_size + + params = 0 + if hasattr(module, "weight") and hasattr(module.weight, "shape"): + params += np.prod(module.weight.shape) + summary[m_key]["trainable"] = module.weight.trainable or ( + not module.weight.stop_gradient) + if hasattr(module, "bias") and hasattr(module.bias, "shape"): + params += np.prod(module.bias.shape) + summary[m_key]["nb_params"] = params + + if (not isinstance(module, nn.Sequential) and + not isinstance(module, nn.LayerList) and + (not (module == model) or depth < 1)): + + hooks.append(module.register_forward_post_hook(hook)) + + if isinstance(input_size, tuple): + input_size = [input_size] + + x = [ + paddle.rand( + [2] + list(in_size), dtype=dtype) + for in_size, dtype in zip(input_size, dtypes) + ] + + # create properties + summary = OrderedDict() + hooks = [] + + # register hook + model.apply(register_hook) + + # make a forward pass + model(*x) + + # remove these hooks + for h in hooks: + h.remove() + + table_width = 80 + summary_str += "-" * table_width + "\n" + line_new = "{:>15} {:>20} {:>20} {:>15}".format( + "Layer (type)", "Input Shape", "Output Shape", "Param #") + summary_str += line_new + "\n" + summary_str += "=" * table_width + "\n" + total_params = 0 + total_output = 0 + trainable_params = 0 + for layer in summary: + # input_shape, output_shape, trainable, nb_params + line_new = "{:>15} {:>20} {:>20} {:>15}".format( + layer, + str(summary[layer]["input_shape"]), + str(summary[layer]["output_shape"]), + "{0:,}".format(summary[layer]["nb_params"]), ) + total_params += summary[layer]["nb_params"] + + total_output += np.prod(summary[layer]["output_shape"]) + if "trainable" in summary[layer]: + if summary[layer]["trainable"] == True: + trainable_params += summary[layer]["nb_params"] + summary_str += line_new + "\n" + + # assume 4 bytes/number (float on cuda). + total_input_size = abs( + np.prod(sum(input_size, ())) * batch_size * 4. / (1024**2.)) + total_output_size = abs(2. * total_output * 4. / + (1024**2.)) # x2 for gradients + total_params_size = abs(total_params * 4. 
/ (1024**2.)) + total_size = total_params_size + total_output_size + total_input_size + + summary_str += "=" * table_width + "\n" + summary_str += "Total params: {0:,}".format(total_params) + "\n" + summary_str += "Trainable params: {0:,}".format(trainable_params) + "\n" + summary_str += "Non-trainable params: {0:,}".format(total_params - + trainable_params) + "\n" + summary_str += "-" * table_width + "\n" + summary_str += "Input size (MB): %0.2f" % total_input_size + "\n" + summary_str += "Forward/backward pass size (MB): %0.2f" % total_output_size + "\n" + summary_str += "Params size (MB): %0.2f" % total_params_size + "\n" + summary_str += "Estimated Total Size (MB): %0.2f" % total_size + "\n" + summary_str += "-" * table_width + "\n" + # return summary + return summary_str, { + 'total_params': total_params, + 'trainable_params': trainable_params + } diff --git a/python/paddle/jit/__init__.py b/python/paddle/jit/__init__.py index 03299a3bb9823d31c40ae4faab601ed89570c71e..d04a65ad6ea99ee2e2e67e47fd9d656f1572a02d 100644 --- a/python/paddle/jit/__init__.py +++ b/python/paddle/jit/__init__.py @@ -14,7 +14,6 @@ from ..fluid.dygraph.jit import save #DEFINE_ALIAS from ..fluid.dygraph.jit import load #DEFINE_ALIAS -from ..fluid.dygraph.jit import SaveLoadConfig #DEFINE_ALIAS from ..fluid.dygraph.jit import TracedLayer #DEFINE_ALIAS from ..fluid.dygraph.jit import set_code_level #DEFINE_ALIAS from ..fluid.dygraph.jit import set_verbosity #DEFINE_ALIAS @@ -23,6 +22,6 @@ from ..fluid.dygraph import ProgramTranslator #DEFINE_ALIAS from ..fluid.dygraph.io import TranslatedLayer #DEFINE_ALIAS __all__ = [ - 'save', 'load', 'SaveLoadConfig', 'TracedLayer', 'to_static', - 'ProgramTranslator', 'TranslatedLayer', 'set_code_level', 'set_verbosity' + 'save', 'load', 'TracedLayer', 'to_static', 'ProgramTranslator', + 'TranslatedLayer', 'set_code_level', 'set_verbosity' ] diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 66caba540f2fed8c035d0f1af14f9e40a329bca5..79583f344f0c1f642586c4a8ecc08f2aa4e24008 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -89,7 +89,7 @@ from .layer.common import CosineSimilarity #DEFINE_ALIAS from .layer.common import Embedding #DEFINE_ALIAS from .layer.common import Linear #DEFINE_ALIAS from .layer.common import Flatten #DEFINE_ALIAS -from .layer.common import UpSample #DEFINE_ALIAS +from .layer.common import Upsample #DEFINE_ALIAS from .layer.common import UpsamplingNearest2d #DEFINE_ALIAS from .layer.common import UpsamplingBilinear2d #DEFINE_ALIAS from .layer.common import Bilinear #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 325eaa64d5ca4bd3d65bf266ff0a42226a3199e6..f3cc8c610ff4da16b6333931913396d84cc05981 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -72,6 +72,7 @@ from .common import unfold #DEFINE_ALIAS # from .common import bilinear_tensor_product #DEFINE_ALIAS from .common import assign #DEFINE_ALIAS from .common import interpolate #DEFINE_ALIAS +from .common import upsample #DEFINE_ALIAS from .common import bilinear #DEFINE_ALIAS from .conv import conv1d #DEFINE_ALIAS from .conv import conv_transpose1d #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 633920dc7e804d18ff63cce4ddb9aa2b215c0bd5..014c778eee98a386e82fffe46dcc932d55aa6574 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ 
-80,6 +80,8 @@ def interpolate(x, The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), + where in_w is the width of the input tensor, in_h is the height of the input tensor, + in_d is the depth of the input tensor. and the resizing only applies on the three dimensions(depth, height and width). Supporting resample methods: @@ -88,6 +90,7 @@ 'trilinear' : Trilinear interpolation 'nearest' : Nearest neighbor interpolation 'bicubic' : Bicubic interpolation + 'area': Area interpolation Linear interpolation is the method of using a line connecting two known quantities to determine the value of an unknown quantity between the two known quantities. @@ -114,6 +117,12 @@ smoother than corresponding surfaces obtained by bilinear interpolation or nearest-neighbor interpolation. + + Area interpolation performs area-based resizing over + the 3rd dimension (height direction), the 4th dimension (width + direction) and the 5th dimension (depth direction) of the input tensor. Setting mode to + 'area' directly calls `paddle.nn.functional.adaptive_avg_pool1d` or + `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. + Example: .. code-block:: text @@ -207,11 +216,11 @@ when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.Has to match input size if it is a list. + scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. + And :attr:`size` has a higher priority than :attr:`scale_factor`. Has to match the input size if it is either a list, a tuple or a Tensor. Default: None. - mode (str): The resample method. It supports 'linear', 'nearest', 'bilinear', + mode (str): The resample method. It supports 'linear', 'area', 'nearest', 'bilinear', 'bicubic' and 'trilinear' currently. Default: 'nearest' align_corners(bool) : An optional bool, If True, the centers of the 4 corner pixels of the input and output tensors are aligned, preserving the values at the @@ -235,7 +244,7 @@ Raises: TypeError: size should be a list or tuple or Tensor. ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear', - 'trilinear', 'bicubic', or 'nearest' currently. + 'trilinear', 'bicubic', 'area' or 'nearest' currently. ValueError: 'linear' only support 3-D tensor. ValueError: 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor. ValueError: 'trilinear' only support 5-D tensor.
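Note: a minimal sketch of the new 'area' mode described above, assuming the paddle 2.0-era dygraph API that this patch targets (shapes and sizes here are illustrative). Per the added branch, a 4-D input with mode='area' dispatches straight to adaptive_avg_pool2d, so the two calls below should agree:

import numpy as np
import paddle
import paddle.nn.functional as F

paddle.disable_static()
x = paddle.to_tensor(np.random.rand(2, 3, 32, 32).astype('float32'))

# mode='area' on a 4-D tensor is implemented as adaptive average pooling.
out_area = F.interpolate(x, size=[16, 16], mode='area')
out_pool = F.adaptive_avg_pool2d(x, output_size=[16, 16])
assert np.allclose(out_area.numpy(), out_pool.numpy())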
@@ -283,10 +292,11 @@ def interpolate(x, 'TRILINEAR', 'NEAREST', 'BICUBIC', + 'AREA', ] if resample not in resample_methods: raise ValueError( - "The 'resample' of image_resize can only be 'linaer', 'bilinear', 'trilinear', " + "The 'resample' of image_resize can only be 'area', 'linear', 'bilinear', 'trilinear', " " 'bicubic' or 'nearest' currently.") if resample in ['LINEAR'] and len(x.shape) != 3: @@ -310,8 +320,17 @@ raise ValueError( "align_corners option can only be set with the interpolating modes: linear | bilinear | bicubic | trilinear" ) + + if resample == 'AREA' and len(x.shape) == 3: + return paddle.nn.functional.adaptive_avg_pool1d(x, size) + + if resample == 'AREA' and len(x.shape) == 4: + return paddle.nn.functional.adaptive_avg_pool2d(x, size) + if resample == 'AREA' and len(x.shape) == 5: + return paddle.nn.functional.adaptive_avg_pool3d(x, size) + helper = LayerHelper('{}_interp_v2'.format(resample_type), **locals()) - dtype = helper.input_dtype() + dtype = helper.input_dtype(input_param_name='x') if len(x.shape) == 3 and data_format not in ['NCW', 'NWC']: raise ValueError( "Got wrong value for param `data_format`: " + data_format + @@ -349,14 +368,15 @@ out_shape = size scale = scale_factor + if out_shape is not None and scale is not None: + raise ValueError("Only one of size or scale_factor should be defined.") if out_shape is not None: if isinstance(out_shape, Variable): out_shape.stop_gradient = True inputs['OutSize'] = out_shape else: if not (_is_list_or_turple_(out_shape)): - raise TypeError( - "out_shape should be a list or tuple or Variable.") + raise TypeError("size should be a list or tuple or Variable.") # Validate the shape contain_var = False for dim_idx, dim_size in enumerate(out_shape): @@ -388,7 +408,7 @@ if len(x.shape) == 3: if len(out_shape) != 1: raise ValueError( - "out_shape length should be 2 for input 3-D tensor") + "size length should be 1 for input 3-D tensor") if contain_var: attrs['out_w'] = size_list[0] else: @@ -396,7 +416,7 @@ attrs['out_w'] = out_shape[0] if len(x.shape) == 4: if len(out_shape) != 2: - raise ValueError("out_shape length should be 2 for " + raise ValueError("size length should be 2 for " "input 4-D tensor.") if contain_var: attrs['out_h'] = size_list[0] @@ -407,7 +427,7 @@ attrs['out_w'] = out_shape[1] if len(x.shape) == 5: if len(out_shape) != 3: - raise ValueError("out_shape length should be 3 for " + raise ValueError("size length should be 3 for " "input 5-D tensor.") if contain_var: attrs['out_d'] = size_list[0] @@ -430,7 +450,7 @@ for i in range(len(x.shape) - 2): scale_list.append(scale) attrs['scale'] = list(map(float, scale_list)) - elif isinstance(scale, list): + elif isinstance(scale, list) or isinstance(scale, tuple): if len(scale) != len(x.shape) - 2: raise ValueError("scale_shape length should be {} for " "input {}-D tensor.".format( @@ -441,7 +461,8 @@ attrs['scale'] = list(map(float, scale)) else: raise TypeError( - "Attr(scale)'s type should be float, int, list or Tensor.") + "Attr(scale)'s type should be float, int, list, tuple, or Tensor." + ) if in_dygraph_mode(): attr_list = [] @@ -480,9 +501,12 @@ def upsample(x, name=None): """ This op resizes a batch of images.
+ The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), + where in_w is the width of the input tensor, in_h is the height of the input tensor, + in_d is the depth of the input tensor. and the resizing only applies on the three dimensions(depth, height and width). Supporting resample methods: @@ -507,12 +531,21 @@ data points on a two-dimensional regular grid. The interpolated surface is smoother than corresponding surfaces obtained by bilinear interpolation or nearest-neighbor interpolation. + Trilinear interpolation is an extension of linear interpolation for interpolating functions of three variables (e.g. D-direction, H-direction and W-direction in this op) on a rectilinear 3D grid. + The linear interpolation is performed on three directions. align_corners and align_mode are optional parameters, the calculation method of interpolation can be selected by them. + + Area interpolation performs area-based resizing over + the 3rd dimension (height direction), the 4th dimension (width + direction) and the 5th dimension (depth direction) of the input tensor. Setting mode to + 'area' directly calls `paddle.nn.functional.adaptive_avg_pool1d` or + `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. + Example: .. code-block:: text For scale_factor: @@ -605,9 +638,10 @@ when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. + scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. + And :attr:`size` has a higher priority than :attr:`scale_factor`. Has to match the input size if + it is either a list, a tuple or a Tensor. Default: None. mode (str): The resample method. It supports 'linear', 'nearest', 'bilinear', 'bicubic' and 'trilinear' currently. Default: 'nearest' @@ -1091,6 +1125,8 @@ def alpha_dropout(x, p=0.5, training=True, name=None): 'alpha_dropout') if training: + if p == 1: + return layers.scale(x, scale=0.) #get transformation params alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 42d7d98aefcbbf51f562b98c4c494aeccfe20cf2..3c1482e69c3c36232ee5d70f2156a8d16c2d212a 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -232,7 +232,7 @@ def conv1d(x, raise ValueError("Attr(data_format) should be 'NCL' or 'NLC'. 
" "Received Attr(data_format): {}.".format(data_format)) - channel_last = (data_format == "NHWC") + channel_last = (data_format == "NLC") channel_dim = -1 if channel_last else 1 conv2d_data_format = "NHWC" if channel_last else "NCHW" num_channels = x.shape[channel_dim] @@ -399,7 +399,7 @@ def conv2d(x, `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `pool_padding` can be in the form + when `data_format` is `"NHWC"`, `padding` can be in the form `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. dilation (int|tuple): The dilation size. It means the spacing between the kernel @@ -733,20 +733,31 @@ def conv_transpose1d(x, stride = utils.convert_to_list(stride, 1, 'stride') + [1] dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1] - output_padding = utils.convert_to_list(output_padding, 1, - 'output_padding') + [0] - if output_padding[0] > stride[0]: - raise ValueError( - "The size of output_padding should not be greater than stride." - "But got output_padding={} and stride={}".format(output_padding[0], - stride[0])) if output_size is None: output_size = [] - elif isinstance(output_size, (list, tuple, int)): - output_size = utils.convert_to_list(output_size, 1, 'output_size') + [1] else: - raise ValueError("output_size should be int, or list, tuple of ints") + if output_padding != 0: + raise ValueError('output_padding option is mutually exclusive with ' + 'output_size') + if isinstance(output_size, (list, tuple, int)): + output_size = utils.convert_to_list(output_size, 1, + 'output_size') + [1] + else: + raise ValueError( + "output_size should be int, or list, tuple of ints") + + if output_padding == 0: + output_padding = [] + else: + output_padding = utils.convert_to_list(output_padding, 1, + 'output_padding') + [0] + + if len(output_padding) > 0 and output_padding[0] > stride[0]: + raise ValueError( + "The size of output_padding should not be greater than stride." 
+ "But got output_padding={} and stride={}".format(output_padding[0], + stride[0])) op_type = 'conv2d_transpose' num_filters = weight.shape[1] @@ -761,16 +772,17 @@ def conv_transpose1d(x, weight = nn.unsqueeze(input=weight, axes=[-1]) if in_dygraph_mode(): - attrs = ('output_size', output_size, 'strides', stride, 'paddings', - padding, 'padding_algorithm', padding_algorithm, 'dilations', - dilation, 'groups', groups, 'use_cudnn', use_cudnn, - 'data_format', conv2d_data_format) + attrs = ('output_padding', output_padding, 'output_size', output_size, + 'strides', stride, 'paddings', padding, 'padding_algorithm', + padding_algorithm, 'dilations', dilation, 'groups', groups, + 'use_cudnn', use_cudnn, 'data_format', conv2d_data_format) out = getattr(core.ops, op_type)(x, weight, *attrs) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) else: inputs = {'Input': [x], 'Filter': [weight]} attrs = { + 'output_padding': output_padding, 'output_size': output_size, 'strides': stride, 'paddings': padding, @@ -791,12 +803,6 @@ def conv_transpose1d(x, if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) - if output_size is None: - out = pad2d( - out, - padding=[0, output_padding, 0, 0], - data_format=conv2d_data_format, - name=name) out = nn.squeeze(input=out, axes=[squeeze_axis]) return out @@ -888,9 +894,9 @@ def conv_transpose2d(x, 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_height, pad_width]` or `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCHW"`, `pool_padding` can be in the form + and when `data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `pool_padding` can be in the form + when `data_format` is `"NHWC"`, `padding` can be in the form `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. output_padding(int|list|tuple, optional): Additional size added to one side @@ -1116,9 +1122,9 @@ def conv3d(x, 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form + and when `data_format` is `"NCDHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form + when `data_format` is `"NDHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. dilation (int|tuple): The dilation size. It means the spacing between the kernel points. @@ -1340,9 +1346,9 @@ def conv_transpose3d(x, 'SAME' which is the padding algorithm. 
If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form + and when `data_format` is `"NCDHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form + when `data_format` is `"NDHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. output_padding(int|list|tuple, optional): Additional size added to one side diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index d2ddee654f4d04de152d15130ba53c424af3e5b2..3d5894064c44cb72259472fc638d46b67c5703fc 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -138,13 +138,10 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', .. code-block:: python import paddle - import numpy as np - input_data = np.array([0.5, 0.6, 0.7]).astype("float32") - label_data = np.array([1.0, 0.0, 1.0]).astype("float32") paddle.disable_static() - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) + input = paddle.to_tensor([0.5, 0.6, 0.7], 'float32') + label = paddle.to_tensor([1.0, 0.0, 1.0], 'float32') output = paddle.nn.functional.binary_cross_entropy(input, label) print(output.numpy()) # [0.65537095] @@ -277,8 +274,8 @@ def binary_cross_entropy_with_logits(logit, import paddle paddle.disable_static() - logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32") - label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32") + logit = paddle.to_tensor([5.0, 1.0, 3.0]) + label = paddle.to_tensor([1.0, 0.0, 1.0]) output = paddle.nn.functional.binary_cross_entropy_with_logits(logit, label) print(output.numpy()) # [0.45618808] @@ -569,13 +566,10 @@ def l1_loss(input, label, reduction='mean', name=None): Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32") - label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32") - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) + input = paddle.to_tensor([[1.5, 0.8], [0.2, 1.3]]) + label = paddle.to_tensor([[1.7, 1], [0.4, 0.5]]) l1_loss = paddle.nn.functional.l1_loss(input, label) print(l1_loss.numpy()) @@ -868,7 +862,7 @@ def mse_loss(input, label, reduction='mean', name=None): Examples: .. 
code-block:: python - import numpy as np + import paddle @@ -878,8 +872,6 @@ def mse_loss(input, label, reduction='mean', name=None): input = paddle.data(name="input", shape=[1]) label = paddle.data(name="label", shape=[1]) place = paddle.CPUPlace() - input_data = np.array([1.5]).astype("float32") - label_data = np.array([1.7]).astype("float32") output = mse_loss(input,label) exe = paddle.static.Executor(place) @@ -894,8 +886,8 @@ def mse_loss(input, label, reduction='mean', name=None): # dynamic graph mode paddle.disable_static() - input = paddle.to_variable(input_data) - label = paddle.to_variable(label_data) + input = paddle.to_tensor(1.5) + label = paddle.to_tensor(1.7) output = mse_loss(input, label) print(output.numpy()) # [0.04000002] diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index f63fc33525576c61998be9facf32b4c66aa2a971..9e8f365f6d23a95275b9a696f6088bb287108ec0 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -176,14 +176,13 @@ def batch_norm(x, mean_out = running_mean variance_out = running_var - true_data_format = ['NC', 'NCL', 'NCHW', 'NCWH', 'NCDHW'] + true_data_format = ['NC', 'NCL', 'NCHW', 'NCDHW'] if data_format not in true_data_format: raise ValueError( - "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCWH', 'NCDHW', but receive {}". + "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', but receive {}". format(data_format)) - if data_format != 'NCWH': - data_format = 'NCHW' + data_format = 'NCHW' if in_dygraph_mode(): # for dygraph need tuple diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index b4a713a1964f5d99503e0b5a221668656fa657d1..662205ab69550255406ff5edfda4556b73b98843 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -168,7 +168,7 @@ def avg_pool1d(x, count_include_pad=True, ceil_mode=False, name=None): - """ + """ This API implements average pooling 1d operation, See more details in :ref:`api_nn_pooling_AvgPool1d` . @@ -280,7 +280,7 @@ def avg_pool2d(x, """ This API implements average pooling 2d operation. See more details in :ref:`api_nn_pooling_AvgPool2d` . - + Args: x (Tensor): The input tensor of pooling operator which is a 4-D tensor with shape [N, C, H, W]. The format of input tensor is `"NCHW"` or @@ -640,7 +640,7 @@ def max_pool2d(x, 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - return_indices (bool): Whether to return the max indices along with the outputs. + return_indices (bool): Whether to return the max indices along with the outputs. Default False, only support `"NCHW"` data format data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. 
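Note: a short sketch of the NCHW-only constraint on return_indices documented above, assuming the functional API as declared in this patch; the implementing hunk follows below:

import paddle
import paddle.nn.functional as F

paddle.disable_static()
x = paddle.rand([1, 3, 32, 32], dtype='float32')

# NCHW (the default) goes through max_pool2d_with_index, so indices are available.
out, indices = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)

# NHWC has no index-producing kernel; per the new check below, combining
# data_format='NHWC' with return_indices=True raises a ValueError.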
@@ -690,15 +690,30 @@ def max_pool2d(x, padding, padding_algorithm = _update_padding_nd( padding, num_dims=2, channel_last=channel_last, ceil_mode=ceil_mode) + if data_format == "NHWC" and return_indices: + raise ValueError( + "When setting return_indices to true, data_format must be set to NCHW in API:max_pool2d" + ) + if in_dygraph_mode(): - output = core.ops.max_pool2d_with_index( - x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, - 'paddings', padding, 'padding_algorithm', padding_algorithm, - 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, - 'exclusive', True, 'data_format', data_format) - return output if return_indices else output[0] + if data_format == "NCHW": + output = core.ops.max_pool2d_with_index( + x, 'ksize', kernel_size, 'global_pooling', False, 'strides', + stride, 'paddings', padding, 'padding_algorithm', + padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', True, 'data_format', + data_format) + return output if return_indices else output[0] + elif data_format == "NHWC" and not return_indices: + output = core.ops.pool2d( + x, 'pooling_type', 'max', 'ksize', kernel_size, + 'global_pooling', False, 'padding_algorithm', padding_algorithm, + 'strides', stride, 'paddings', padding, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) + return output - op_type = 'max_pool2d_with_index' + op_type = 'max_pool2d_with_index' if data_format == "NCHW" else "max_pool2d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -739,7 +754,7 @@ def max_pool3d(x, See more details in :ref:`api_nn_pooling_MaxPool3d` . Args: x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with - shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` or `"NDHWC"`, where N represents batch size, C represents the number of channels, D, H and W represent the depth, height and width of the feature respectively. + shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` or `"NDHWC"`, where N represents batch size, C represents the number of channels, D, H and W represent the depth, height and width of the feature respectively. kernel_size (int|list|tuple): The pool kernel size. If the kernel size is a tuple or list, it must contain three integers, (kernel_size_Depth, kernel_size_Height, kernel_size_Width). @@ -755,7 +770,7 @@ def max_pool3d(x, 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): ${ceil_mode_comment} - return_indices (bool): Whether to return the max indices along with the outputs. + return_indices (bool): Whether to return the max indices along with the outputs. Default False. Only the "NCDHW" data_format is supported. data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`.
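The dygraph branch above now dispatches on layout: max_pool2d_with_index, which can return argmax indices, is used only for "NCHW", while "NHWC" inputs fall back to the plain pool2d op and therefore cannot return indices. A usage sketch of the resulting behavior (input shapes are illustrative, not from the patch):

    import numpy as np
    import paddle
    paddle.disable_static()

    x = paddle.to_tensor(np.random.rand(1, 3, 32, 32).astype('float32'))
    # Default "NCHW" layout: indices come from max_pool2d_with_index.
    out, indices = paddle.nn.functional.max_pool2d(
        x, kernel_size=2, stride=2, return_indices=True)
    print(out.shape, indices.shape)  # [1, 3, 16, 16] [1, 3, 16, 16]
    # data_format="NHWC" together with return_indices=True now raises the
    # ValueError added above instead of running the NCHW-only indexed kernel.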
@@ -801,15 +816,30 @@ def max_pool3d(x, padding, padding_algorithm = _update_padding_nd( padding, 3, channel_last=channel_last, ceil_mode=ceil_mode) + if data_format == "NDHWC" and return_indices: + raise ValueError( + "When setting return_indices to true, data_format must be set to NCDHW in API:max_pool3d" + ) + if in_dygraph_mode(): - output = core.ops.max_pool3d_with_index( - x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', stride, - 'paddings', padding, 'global_pooling', False, 'padding_algorithm', - padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', True, 'data_format', data_format) - return output if return_indices else output[0] + if data_format == "NCDHW": + output = core.ops.max_pool3d_with_index( + x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', + stride, 'paddings', padding, 'global_pooling', False, + 'padding_algorithm', padding_algorithm, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) + return output if return_indices else output[0] + elif data_format == "NDHWC" and not return_indices: + output = core.ops.pool3d( + x, 'pooling_type', 'max', 'ksize', kernel_size, + 'global_pooling', False, 'padding_algorithm', padding_algorithm, + 'strides', stride, 'paddings', padding, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) + return output - op_type = "max_pool3d_with_index" + op_type = "max_pool3d_with_index" if data_format == "NCDHW" else "max_pool3d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -841,7 +871,7 @@ def adaptive_avg_pool1d(x, output_size, name=None): """ This API implements adaptive average pooling 1d operation. See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` . - + Args: x (Tensor): The input tensor of pooling operator, which is a 3-D tensor with shape [N, C, L]. The format of input tensor is NCL, @@ -976,6 +1006,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): if isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 2, 'output_size') else: + output_size = list(output_size) if output_size[0] == None: output_size[0] = in_h if output_size[1] == None: @@ -1079,6 +1110,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): if isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 3, 'output_size') else: + output_size = list(output_size) if output_size[0] == None: output_size[0] = in_l if output_size[1] == None: @@ -1123,8 +1155,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): with shape [N, C, L]. The format of input tensor is NCL, where N is batch size, C is the number of channels, L is the length of the feature. The data type is float32 or float64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. + output_size (int): The pool kernel size. The value should be an integer. return_indices (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False. name(str, optional): For detailed information, please refer @@ -1134,9 +1165,10 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): Tensor: The output tensor of adaptive pooling result. The data type is same as input tensor. 
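max_pool3d mirrors the max_pool2d dispatch, and the added output_size = list(output_size) copies fix a crash in the adaptive pooling functions when output_size is a tuple containing None: tuples do not support item assignment, so filling the None entries from the input shape used to fail. A sketch of the now-working calls (shapes are illustrative):

    import numpy as np
    import paddle
    paddle.disable_static()

    x = paddle.to_tensor(np.random.rand(2, 3, 8, 8).astype('float32'))
    # A tuple with None entries is copied into a list before the None
    # entries are replaced by the matching input dimensions.
    out = paddle.nn.functional.adaptive_avg_pool2d(x, output_size=(4, None))
    print(out.shape)  # [2, 3, 4, 8], None keeps the input width

    # adaptive_max_pool1d now accepts only a plain int for output_size.
    x1 = paddle.to_tensor(np.random.rand(1, 3, 32).astype('float32'))
    out1 = paddle.nn.functional.adaptive_max_pool1d(x1, output_size=16)
    print(out1.shape)  # [1, 3, 16]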
Raises: - ValueError: 'output_size' should be a integer or list or tuple with length as 1. + ValueError: 'output_size' should be an integer. Examples: .. code-block:: python + # max adaptive pool1d # suppose input data in shape of [N, C, L], `output_size` is m or [m], # output shape is [N, C, m], adaptive pool divide L dimension @@ -1162,7 +1194,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'adaptive_max_pool1d') _check_input(x, 3) - check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d') + check_type(output_size, 'pool_size', int, 'adaptive_max_pool1d') check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d') pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') @@ -1201,15 +1233,19 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None): """ This operation applies a 2D adaptive max pooling on input tensor. See more details in :ref:`api_nn_pooling_AdaptiveMaxPool2d` . + Args: x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float16, float32, float64, int32 or int64. output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two elements, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Returns: Tensor: The output tensor of adaptive max pool2d result. The data type is same as input tensor. + Examples: .. code-block:: python + # max adaptive pool2d # suppose input data in the shape of [N, C, H, W], `output_size` is [m, n] # output shape is [N, C, m, n], adaptive pool divide H and W dimensions @@ -1247,6 +1283,7 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None): if isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 2, 'output_size') else: + output_size = list(output_size) if output_size[0] == None: output_size[0] = in_h if output_size[1] == None: @@ -1283,15 +1320,19 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None): """ This operation applies a 3D adaptive max pooling on input tensor. See more details in :ref:`api_nn_pooling_AdaptiveMaxPool3d` . + Args: x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64. output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input. return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Returns: Tensor: The output tensor of adaptive max pool3d result. The data type is same as input tensor. + Examples: .. 
code-block:: python + # adaptive max pool3d # suppose input data in the shape of [N, C, D, H, W], `output_size` is [l, m, n] # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions @@ -1333,6 +1374,7 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None): if isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 3, 'output_size') else: + output_size = list(output_size) if output_size[0] == None: output_size[0] = in_l if output_size[1] == None: diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 7d7a392ebe80c3af8c991dbff746d0f8f216b18b..760af09f1f2f5af066058572f681ec21f9a93180 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -59,7 +59,7 @@ from .common import CosineSimilarity #DEFINE_ALIAS from .common import Embedding #DEFINE_ALIAS from .common import Linear #DEFINE_ALIAS from .common import Flatten #DEFINE_ALIAS -from .common import UpSample #DEFINE_ALIAS +from .common import Upsample #DEFINE_ALIAS from .common import UpsamplingNearest2d #DEFINE_ALIAS from .common import UpsamplingBilinear2d #DEFINE_ALIAS from .common import Dropout #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index d8e1d03b02840e76ff865986d8b90ca9d6cdd9f8..a1923542c40828b0886e9f5ab1f97e04e94fec92 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -26,7 +26,7 @@ __all__ = [ 'Pool2D', 'Embedding', 'Linear', - 'UpSample', + 'Upsample', 'Pad2D', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', @@ -131,12 +131,15 @@ class Linear(layers.Layer): return out -class UpSample(layers.Layer): +class Upsample(layers.Layer): """ This op resizes a batch of images. + The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), + where in_w is the width of the input tensor, in_h is the height of the input tensor, + and in_d is the depth of the input tensor, and the resizing only applies on the three dimensions(depth, height and width). Supporting resample methods: @@ -171,6 +174,12 @@ class UpSample(layers.Layer): align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. + Area interpolation is performed + in the 3rd dimension(in height direction), the 4th dimension(in width + direction) and the 5th dimension(in depth direction) on the input tensor. Setting mode to + area will directly call `paddle.nn.functional.adaptive_avg_pool1d` or + `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. + Example: .. code-block:: text @@ -273,9 +282,9 @@ class UpSample(layers.Layer): when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.Has to match input size if it is a list. + scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set.
+ And :attr:`size` has a higher priority than :attr:`scale_factor`. Has to match input size if it is either a list or a tuple or a Tensor. Default: None. mode (str): The resample method. It supports 'linear', 'nearst', 'bilinear', 'bicubic' and 'trilinear' currently. Default: 'nearest' @@ -322,7 +331,7 @@ class UpSample(layers.Layer): paddle.disable_static() input_data = np.random.rand(2,3,6,10).astype("float32") - upsample_out = paddle.nn.UpSample(size=[12,12]) + upsample_out = paddle.nn.Upsample(size=[12,12]) input = paddle.to_tensor(input_data) output = upsample_out(x=input) @@ -339,7 +348,7 @@ class UpSample(layers.Layer): align_mode=0, data_format='NCHW', name=None): - super(UpSample, self).__init__() + super(Upsample, self).__init__() self.size = size self.scale_factor = scale_factor self.mode = mode.lower() @@ -366,7 +375,8 @@ class UpsamplingNearest2d(layers.Layer): """ This op upsamples a batch of images, using nearest neighbours' pixel values. The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), - and the upsampling only applies on the two dimensions(height and width). + where in_w is width of the input tensor, in_h is the height of the input tensor. + And the upsampling only applies on the two dimensions(height and width). Nearest neighbor interpolation is to perform nearest neighbor interpolation in both the 3rd dimension(in height direction) and the 4th dimension(in width @@ -381,10 +391,11 @@ class UpsamplingNearest2d(layers.Layer): layer, the shape is (out_h, out_w) when input is a 4-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|int|list|Tensor|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. - Default: None. Has to match input size if it is a list. + scale_factor (float|int|list|tuple|Tensor|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. + And :attr:`size` has a higher priority than :attr:`scale_factor`. + Has to match input size if it is either a list or a tuple or a Tensor. + Default: None. data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: @@ -449,7 +460,8 @@ class UpsamplingBilinear2d(layers.Layer): """ This op upsamples a batch of images, using bilinear' pixel values. The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), - and the upsampling only applies on the two dimensions(height and width). + where in_w is width of the input tensor, in_h is the height of the input tensor. + And the upsampling only applies on the two dimensions(height and width). Bilinear interpolation is an extension of linear interpolation for interpolating functions of two variables (e.g. H-direction and @@ -466,10 +478,11 @@ class UpsamplingBilinear2d(layers.Layer): layer, the shape is (out_h, out_w) when input is a 4-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. 
- scale_factor (float|int|list|Tensor|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. - Default: None. Has to match input size if it is a list. + scale_factor (float|int|list|tuple|Tensor|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. + And :attr:`size` has a higher priority than :attr:`scale_factor`. + Has to match input size if it is either a list or a tuple or a Tensor. + Default: None. data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 4e342c00528a2c0115940bb7f695e1ed5b582382..a610693a0a46b7e21d2c6d83716a7bc029677583 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -113,7 +113,7 @@ class _ConvNd(layers.Layer): attr=self._bias_attr, shape=[self._out_channels], is_bias=True) -class Conv1d(layers.Layer): +class Conv1d(_ConvNd): """ This interface is used to construct a callable object of the ``Conv1d`` class. For more details, refer to code examples. @@ -172,8 +172,7 @@ class Conv1d(layers.Layer): When in 'replicate' mode, uses input boundaries to pad the input tensor. When in 'circular' mode, uses circular input to pad the input tensor. Default is 'zeros'. - bias(bool, optional): Whether to use bias. Default: True. - param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) + weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) of conv1d. If it is set to None or one attribute of ParamAttr, conv1d will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with :math:`Normal(0.0, std)`, @@ -227,205 +226,15 @@ class Conv1d(layers.Layer): dilation=1, groups=1, padding_mode='zeros', - bias=True, weight_attr=None, bias_attr=None, - data_format="NCL", - name=None): - super(Conv1d, self).__init__() - assert weight_attr is not False, "param_attr should not be False here." - self._in_channels = in_channels - self._out_channels = out_channels - self._groups = groups - if in_channels % groups != 0: - raise ValueError("in_channels must be divisible by groups.") - self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size') - self._stride = utils.convert_to_list(stride, 1, 'stride') - self._dilation = utils.convert_to_list(dilation, 1, 'dilation') - self._padding = padding # leave it to F.conv1d - self._weight_attr = weight_attr - self._bias_attr = bias_attr - self._data_format = data_format - self._name = name - - self._padding_mode = padding_mode - - valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'} - if padding_mode not in valid_padding_modes: - raise ValueError( - "padding_mode must be one of {}, but got padding_mode='{}'". 
- format(valid_padding_modes, padding_mode)) - - if padding_mode in {'reflect', 'replicate', 'circular' - } and not isinstance(padding, np.int): - raise ValueError( - "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int" - ) - if not isinstance(padding, str): - self._padding = utils.convert_to_list(padding, 1, 'padding') * 2 - - num_filter_channels = in_channels // groups - filter_shape = [self._out_channels, num_filter_channels - ] + self._kernel_size - - self.weight = self.create_parameter( - attr=self._weight_attr, - shape=filter_shape, - default_initializer=_get_default_param_initializer( - self._in_channels, filter_shape)) - self.bias = self.create_parameter( - attr=self._bias_attr, shape=[self._out_channels], - is_bias=True) if bias else None - - def forward(self, x): - padding = 0 - if self._padding_mode != "zeros": - x = F.pad(x, - self._padding, - mode=self._padding_mode, - data_format=self._data_format) - else: - padding = self._padding - - out = F.conv1d( - x, - self.weight, - bias=self.bias, - padding=padding, - stride=self._stride, - dilation=self._dilation, - groups=self._groups, - data_format=self._data_format, - name=self._name) - return out - - -class Conv2d(_ConvNd): - """ - This interface is used to construct a callable object of the ``Conv2d`` class. - For more details, refer to code examples. - The convolution2D layer calculates the output based on the input, filter - and strides, paddings, dilations, groups parameters. Input and - Output are in NCHW format, where N is batch size, C is the number of - the feature map, H is the height of the feature map, and W is the width of the feature map. - Filter's shape is [MCHW] , where M is the number of output feature map, - C is the number of input feature map, H is the height of the filter, - and W is the width of the filter. If the groups is greater than 1, - C will equal the number of input feature map divided by the groups. - Please refer to UFLDL's `convolution - `_ - for more details. - If bias attribution and activation type are provided, bias is added to the - output of the convolution, and the corresponding activation function is - applied to the final result. - For each input :math:`X`, the equation is: - - .. math:: - - Out = \sigma (W \\ast X + b) - - Where: - - * :math:`X`: Input value, a ``Tensor`` with NCHW format. - * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . - * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. - * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - - Parameters: - in_channels(int): The number of input channels in the input image. - out_channels(int): The number of output channels produced by the convolution. - kernel_size(int|list|tuple, optional): The size of the convolving kernel. - stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must - contain three integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. The default value is 1. - padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. - 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` - 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. 
It has the form [pad_d1, pad_d2, ...]. - 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. - 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). - The default value is 0. - dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must - contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the - dilation_D = dilation_H = dilation_W = dilation. The default value is 1. - groups(int, optional): The groups number of the Conv3d Layer. According to grouped - convolution in Alex Krizhevsky's Deep CNN paper: when group=2, - the first half of the filters is only connected to the first half - of the input channels, while the second half of the filters is only - connected to the second half of the input channels. The default value is 1. - padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``. - weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights - of conv2d. If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as param_attr. If it is set to None, the parameter - is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is - :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. - bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. The default value is None. - data_format(str, optional): Data format that specifies the layout of input. - It can be "NCHW" or "NHWC". Default: "NCHW". - - Attribute: - - **weight** (Parameter): the learnable weights of filter of this layer. - - **bias** (Parameter or None): the learnable bias of this layer. - - Shape: - - - x: :math:`(N, C_{in}, H_{in}, W_{in})` - - - output: :math:`(N, C_{out}, H_{out}, W_{out})` - - Where - - .. math:: - - H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 - - W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 - - Examples: - - .. 
code-block:: python - - import numpy as np - import paddle - import paddle.nn as nn - x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32') - - paddle.disable_static() - x_var = paddle.to_tensor(x) - conv = nn.Conv2d(4, 6, (3, 3)) - y_var = conv(x_var) - y_np = y_var.numpy() - print(y_np.shape) - - # (2, 6, 6, 6) - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - padding_mode='zeros', - weight_attr=None, - bias_attr=None, - data_format="NCHW"): - super(Conv2d, self).__init__( + data_format="NCL"): + super(Conv1d, self).__init__( in_channels, out_channels, kernel_size, False, - 2, + 1, stride=stride, padding=padding, padding_mode=padding_mode, @@ -436,25 +245,20 @@ class Conv2d(_ConvNd): data_format=data_format) def forward(self, x): - if self._padding_mode != 'zeros': + padding = 0 + if self._padding_mode != "zeros": x = F.pad(x, - self._reversed_padding_repeated_twice, + self._padding, mode=self._padding_mode, data_format=self._data_format) - return F.conv2d( - x, - self.weight, - bias=self.bias, - stride=self._stride, - dilation=self._dilation, - groups=self._groups, - data_format=self._data_format) + else: + padding = self._padding - out = F.conv2d( + out = F.conv1d( x, self.weight, bias=self.bias, - padding=self._padding, + padding=padding, stride=self._stride, dilation=self._dilation, groups=self._groups, @@ -462,7 +266,7 @@ class Conv2d(_ConvNd): return out -class ConvTranspose1d(layers.Layer): +class ConvTranspose1d(_ConvNd): """ This interface is used to construct a callable object of the ``ConvTranspose1d`` class. For more details, refer to code examples. @@ -603,34 +407,24 @@ class ConvTranspose1d(layers.Layer): padding=0, output_padding=0, groups=1, - bias=True, dilation=1, weight_attr=None, bias_attr=None, data_format="NCL"): - super(ConvTranspose1d, self).__init__() - assert weight_attr is not False, "param_attr should not be False in ConvTranspose1d." 
- self._param_attr = weight_attr - self._bias_attr = bias_attr - self._groups = groups - self._in_channels = in_channels - self._out_channels = out_channels - self._output_padding = output_padding - self._data_format = data_format - self._bias = bias - - self._stride = utils.convert_to_list(stride, 1, 'stride') - self._dilation = utils.convert_to_list(dilation, 1, 'dilation') - self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size') - self._padding = padding - - filter_shape = [self._in_channels, out_channels // groups - ] + self._kernel_size - self.weight = self.create_parameter( - shape=filter_shape, attr=self._param_attr) - self.bias = self.create_parameter( - attr=self._bias_attr, shape=[self._out_channels], - is_bias=True) if self._bias else None + super(ConvTranspose1d, self).__init__( + in_channels, + out_channels, + kernel_size, + True, + 1, + stride=stride, + padding=padding, + dilation=dilation, + output_padding=output_padding, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) def forward(self, x, output_size=None): out = F.conv_transpose1d( @@ -638,7 +432,169 @@ class ConvTranspose1d(layers.Layer): self.weight, bias=self.bias, output_size=output_size, - output_padding=self._output_padding, + output_padding=self.output_padding, + padding=self._padding, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + data_format=self._data_format) + return out + + +class Conv2d(_ConvNd): + """ + This interface is used to construct a callable object of the ``Conv2d`` class. + For more details, refer to code examples. + The convolution2D layer calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input and + Output are in NCHW format, where N is batch size, C is the number of + the feature map, H is the height of the feature map, and W is the width of the feature map. + Filter's shape is [MCHW] , where M is the number of output feature map, + C is the number of input feature map, H is the height of the filter, + and W is the width of the filter. If the groups is greater than 1, + C will equal the number of input feature map divided by the groups. + Please refer to UFLDL's `convolution + `_ + for more details. + If bias attribution and activation type are provided, bias is added to the + output of the convolution, and the corresponding activation function is + applied to the final result. + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + Where: + + * :math:`X`: Input value, a ``Tensor`` with NCHW format. + * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Parameters: + in_channels(int): The number of input channels in the input image. + out_channels(int): The number of output channels produced by the convolution. + kernel_size(int|list|tuple, optional): The size of the convolving kernel. + stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. The default value is 1. + padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms. + 1. a string in ['valid', 'same']. + 2. an int, which means each spatial dimension(height, width) is zero padded by size of `padding` + 3. a list[int] or tuple[int] whose length is the number of spatial dimensions, which contains the amount of padding on each side for each spatial dimension. It has the form [pad_d1, pad_d2, ...]. + 4. a list[int] or tuple[int] whose length is 2 * number of spatial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spatial dimensions. + 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). + The default value is 0. + dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. The default value is 1. + groups(int, optional): The groups number of the Conv2d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. The default value is 1. + padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``. + weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights + of conv2d. If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as param_attr. If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. The default value is None. + data_format(str, optional): Data format that specifies the layout of input. + It can be "NCHW" or "NHWC". Default: "NCHW". + + Attribute: + + **weight** (Parameter): the learnable weights of filter of this layer. + + **bias** (Parameter or None): the learnable bias of this layer. + + Shape: + + - x: :math:`(N, C_{in}, H_{in}, W_{in})` + + - output: :math:`(N, C_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 + + W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 + + Examples: + + ..
code-block:: python + + import numpy as np + import paddle + import paddle.nn as nn + x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32') + + paddle.disable_static() + x_var = paddle.to_tensor(x) + conv = nn.Conv2d(4, 6, (3, 3)) + y_var = conv(x_var) + y_np = y_var.numpy() + print(y_np.shape) + + # (2, 6, 6, 6) + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format="NCHW"): + super(Conv2d, self).__init__( + in_channels, + out_channels, + kernel_size, + False, + 2, + stride=stride, + padding=padding, + padding_mode=padding_mode, + dilation=dilation, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) + + def forward(self, x): + if self._padding_mode != 'zeros': + x = F.pad(x, + self._reversed_padding_repeated_twice, + mode=self._padding_mode, + data_format=self._data_format) + return F.conv2d( + x, + self.weight, + bias=self.bias, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + data_format=self._data_format) + + out = F.conv2d( + x, + self.weight, + bias=self.bias, padding=self._padding, stride=self._stride, dilation=self._dilation, @@ -920,8 +876,8 @@ class Conv3d(_ConvNd): in_channels, out_channels, kernel_size, - padding=0, stride=1, + padding=0, dilation=1, groups=1, padding_mode='zeros', @@ -1128,7 +1084,7 @@ class ConvTranspose3d(_ConvNd): bias_attr=bias_attr, data_format=data_format) - def forward(self, x, output_size): + def forward(self, x, output_size=None): if output_size is None: output_padding = self.output_padding else: diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 8bdb09c76918d747d644a8c781dabb6aab41522c..2000fbf388f88d1da7119402104706a433cebf06 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -811,7 +811,7 @@ class BatchNorm2d(_BatchNormBase): If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. + data_format(str, optional): Specify the input data format, the data format can be "NCHW". Default: NCHW. track_running_stats(bool, optional): Whether to use global mean and variance. In train period, True will track global mean and variance used for inference. When inference, track_running_stats must be True. Default: True. 
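With Conv1d, ConvTranspose1d and Conv2d now rebased onto the shared _ConvNd base class, parameter creation and padding handling follow one code path: for a padding_mode other than 'zeros' the input is padded with F.pad first and the convolution itself runs with padding=0. A usage sketch (shapes are illustrative, not from the patch):

    import numpy as np
    import paddle
    import paddle.nn as nn
    paddle.disable_static()

    x = paddle.to_tensor(np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32'))
    # padding=1 with padding_mode='reflect': the input is reflect-padded via
    # F.pad, then convolved unpadded, so the spatial size stays 8x8.
    conv = nn.Conv2d(4, 6, (3, 3), padding=1, padding_mode='reflect')
    print(conv(x).shape)  # [2, 6, 8, 8]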
@@ -844,10 +844,10 @@ class BatchNorm2d(_BatchNormBase): """ def _check_data_format(self, input): - if input == 'NCHW' or input == 'NCWH': + if input == 'NCHW': self._data_format = input else: - raise ValueError('expected NCHW or NCWH for data_format input') + raise ValueError('expected NCHW for data_format input') def _check_input_dim(self, input): if len(input.shape) != 4: @@ -1130,10 +1130,10 @@ class SyncBatchNorm(_BatchNormBase): """ layer_output = layer if isinstance(layer, _BatchNormBase): - layer_output = SyncBatchNorm(layer._num_features, layer._epsilon, - layer._momentum, layer._weight_attr, - layer._bias_attr, layer._data_format, - layer._name) + layer_output = SyncBatchNorm( + layer._num_features, layer._momentum, layer._epsilon, + layer._weight_attr, layer._bias_attr, layer._data_format, + layer._track_running_stats, layer._name) if layer._weight_attr != False and layer._bias_attr != False: with no_grad(): diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 4cb661cf541222ec4f05df0fdc69b6483f04cf55..129dae93b38327308263550e73031b607b2eacc3 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -87,6 +87,7 @@ class AvgPool1d(layers.Layer): Examples: .. code-block:: python + import paddle import paddle.nn as nn paddle.disable_static() @@ -176,6 +177,7 @@ class AvgPool2d(layers.Layer): ShapeError: If the output's shape calculated is not greater than 0. Examples: .. code-block:: python + import paddle import paddle.nn as nn import numpy as np @@ -267,6 +269,7 @@ class AvgPool3d(layers.Layer): Examples: .. code-block:: python + import paddle import paddle.nn as nn import numpy as np @@ -457,6 +460,7 @@ class MaxPool2d(layers.Layer): Examples: .. code-block:: python + import paddle import paddle.nn as nn import numpy as np @@ -547,6 +551,7 @@ class MaxPool3d(layers.Layer): Examples: .. code-block:: python + import paddle import paddle.nn as nn import numpy as np @@ -849,7 +854,7 @@ class AdaptiveMaxPool1d(layers.Layer): lend &= ceil((i + 1) * L_{in} / L_{out}) - Output(i) &= max(Input[lstart:lend])} + Output(i) &= max(Input[lstart:lend]) Args: output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, @@ -915,8 +920,11 @@ class AdaptiveMaxPool2d(layers.Layer): """ This operation applies 2D adaptive max pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. + For adaptive max pool2d: + .. math:: + hstart &= floor(i * H_{in} / H_{out}) hend &= ceil((i + 1) * H_{in} / H_{out}) wstart &= floor(j * W_{in} / W_{out}) @@ -931,11 +939,12 @@ class AdaptiveMaxPool2d(layers.Layer): Shape: x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float32, float64. output (Tensor): The output tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type is same as input x. - + Returns: A callable object of AdaptiveMaxPool2d. Examples: .. code-block:: python + # adaptive max pool2d # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], # output shape is [N, C, m, n], adaptive pool divide H and W dimensions @@ -976,10 +985,13 @@ class AdaptiveMaxPool2d(layers.Layer): class AdaptiveMaxPool3d(layers.Layer): """ - This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions + This operation applies 3D adaptive max pooling on input tensor. 
The h and w dimensions of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. + For adaptive max pool3d: + .. math:: + dstart &= floor(i * D_{in} / D_{out}) dend &= ceil((i + 1) * D_{in} / D_{out}) hstart &= floor(j * H_{in} / H_{out}) @@ -987,10 +999,9 @@ class AdaptiveMaxPool3d(layers.Layer): wstart &= floor(k * W_{in} / W_{out}) wend &= ceil((k + 1) * W_{in} / W_{out}) Output(i ,j, k) &= max(Input[dstart:dend, hstart:hend, wstart:wend]) + Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means - the size will be the same as that of the input. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either an int, or None which means the size will be the same as that of the input. return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and @@ -1002,6 +1013,7 @@ class AdaptiveMaxPool3d(layers.Layer): A callable object of AdaptiveMaxPool3d. Examples: .. code-block:: python + # adaptive max pool3d # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions @@ -1028,10 +1040,10 @@ class AdaptiveMaxPool3d(layers.Layer): pool = paddle.nn.AdaptiveMaxPool3d(output_size=4) out = pool(x) # out shape: [2, 3, 4, 4, 4] - pool, indices = paddle.nn.AdaptiveMaxPool3d(output_size=3, return_indices=True) - out = pool(x) + pool = paddle.nn.AdaptiveMaxPool3d(output_size=3, return_indices=True) + out, indices = pool(x) # out shape: [2, 3, 4, 4, 4], indices shape: [2, 3, 4, 4, 4] - + """ def __init__(self, output_size, return_indices=False, name=None): diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py index ad53bf394660f3a7e0e48fdbd5eb530abd0852bb..7a21e7661d4e78d0004996ee67c80ddc35006bc3 100644 --- a/python/paddle/nn/utils/weight_norm_hook.py +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -112,6 +112,14 @@ class WeightNorm(object): if dim is None: dim = -1 + # support dim as a negative number, (dim = -1) == (dim = None) + weight_dim = len(layer._parameters[name].shape) + assert ( + dim < weight_dim and dim >= -1 * weight_dim + ), "dim must be set between [-R, R), R means the dimension of weight." + if dim != -1: + dim = (dim + weight_dim) % weight_dim + fn = WeightNorm(name, dim) w = getattr(layer, name) diff --git a/python/paddle/optimizer/lr_scheduler.py b/python/paddle/optimizer/lr_scheduler.py index 4ecaffb8fa509bdc54067bb25f8d1b5191b7ac1b..61391704061bda7dfbad7252cbc04c0b7d6492a4 100644 --- a/python/paddle/optimizer/lr_scheduler.py +++ b/python/paddle/optimizer/lr_scheduler.py @@ -109,7 +109,7 @@ class _LRScheduler(object): """ self.keys = ['last_epoch', 'last_lr'] - def set_dict(self, state_dict): + def set_state_dict(self, state_dict): """ Loads the schedulers state. """ @@ -126,8 +126,8 @@ class _LRScheduler(object): "There are some unused values in state_dict.
Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict" ) - # alias for set_dict - set_state_dict = set_dict + # alias for set_state_dict + set_dict = set_state_dict def get_lr(self): # calculate by python float diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 2f7bc94e646324b849b0308b219261f56eba1e28..1bd9a1f144ed4b5c69d76070eadc317e2063e25b 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -169,7 +169,7 @@ class Optimizer(object): import paddle paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) state_dict = adam.state_dict() @@ -199,7 +199,7 @@ class Optimizer(object): import paddle paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() paddle.framework.save(state_dict, "paddle_dy") @@ -371,7 +371,7 @@ class Optimizer(object): import paddle # example1: _LRScheduler is not used, return value is all the same paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) lr = adam.get_lr() print(lr) # 0.001 diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 9ef66712540aa54eac39b7e6160c5c91b6e3fcd5..9eece1240d7d3c0b8a863091367e993047bd4527 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -73,8 +73,8 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor, paddle\.ComplexTensor. dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8'. And - 'complex64' , 'complex128' only for ComplexTensor. Default: None, for float point number, - get type from ``get_default_type``, for other type, infers from ``data`` . + 'complex64' , 'complex128' only for ComplexTensor. Default: None, infers dtype from ``data`` + except for Python float numbers, which get dtype from ``get_default_dtype`` . place(CPUPlace|CUDAPinnedPlace|CUDAPlace, optional): The place to allocate Tensor. Can be CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True. @@ -188,13 +188,21 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): raise TypeError( "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor|paddle.ComplexTensor".
format(type(data))) + if not dtype and data.dtype in [ + 'float16', 'float32', 'float64', 'complex64', 'complex128' + ]: + default_type = paddle.get_default_dtype() + if np.iscomplexobj(data): + default_type = 'complex64' if default_type in [ + 'float16', 'float32' + ] else 'complex128' + data = data.astype(default_type) + + if dtype and convert_dtype(dtype) != data.dtype: + data = data.astype(dtype) if not np.iscomplexobj(data): - if dtype: - dtype = convert_dtype(dtype) - elif data.dtype in ['float16', 'float32', 'float64']: - dtype = paddle.framework.get_default_dtype() - if dtype and dtype != data.dtype: + if dtype and convert_dtype(dtype) != data.dtype: data = data.astype(dtype) return paddle.Tensor( value=data, @@ -203,14 +211,6 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): zero_copy=True, stop_gradient=stop_gradient) else: - if dtype: - dtype = convert_dtype(dtype) - else: - dtype = paddle.framework.get_default_dtype() - dtype = 'complex64' if dtype in ['float16', 'float32' - ] else 'complex128' - if dtype != data.dtype: - data = data.astype(dtype) name = unique_name.generate('generated_tensor') real_tensor = paddle.Tensor( value=data.real, @@ -244,10 +244,6 @@ def full_like(x, fill_value, dtype=None, name=None): Returns: Tensor: Tensor which is created according to ``x``, ``fill_value`` and ``dtype``. - Raises: - TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64 and None. - Examples: .. code-block:: python @@ -303,11 +299,6 @@ def ones(shape, dtype=None, name=None): Returns: Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 1. - Raises: - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64 and None. - TypeError: The ``shape`` must be one of list, tuple and Tensor. The data type of ``shape`` must - be int32 or int64 when it's a Tensor. - Examples: .. code-block:: python @@ -366,11 +357,10 @@ def ones_like(x, dtype=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([1,2,3], dtype='float32')) + x = paddle.to_tensor([1,2,3]) out1 = paddle.zeros_like(x) # [1., 1., 1.] out2 = paddle.zeros_like(x, dtype='int32') # [1, 1, 1] @@ -392,11 +382,6 @@ def zeros(shape, dtype=None, name=None): Returns: Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 0. - Raises: - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64 and None. - TypeError: The ``shape`` must be one of list, tuple and Tensor. The data type of ``shape`` must - be int32 or int64 when it's a Tensor. - Examples: .. code-block:: python @@ -453,11 +438,10 @@ def zeros_like(x, dtype=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([1,2,3], dtype='float32')) + x = paddle.to_tensor([1,2,3]) out1 = paddle.zeros_like(x) # [0., 0., 0.] out2 = paddle.zeros_like(x, dtype='int32') # [0, 0, 0] @@ -482,10 +466,6 @@ def eye(num_rows, num_columns=None, dtype=None, name=None): Returns: Tensor: An identity Tensor or LoDTensor of shape [num_rows, num_columns]. - - Raises: - TypeError: The ``dtype`` must be one of float16, float32, float64, int32 int64 and None. - TypeError: The ``num_columns`` must be non-negative int. Examples: .. 
code-block:: python @@ -534,11 +514,6 @@ def full(shape, fill_value, dtype=None, name=None): Returns: Tensor: Tensor which is created according to ``shape``, ``fill_value`` and ``dtype``. - Raises: - TypeError: The ``dtype`` must be one of None, bool, float16, float32, float64, int32 and int64. - TypeError: The ``shape`` must be one of Tensor, list and tuple. The data type of ``shape`` must - be int32 or int64 when the it's a Tensor - Examples: .. code-block:: python @@ -619,7 +594,6 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() @@ -633,7 +607,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): out3 = paddle.arange(4.999, dtype='float32') # [0., 1., 2., 3., 4.] - start_var = paddle.to_tensor(np.array([3])) + start_var = paddle.to_tensor([3]) out4 = paddle.arange(start_var, 7) # [3, 4, 5, 6] @@ -725,7 +699,7 @@ def tril(x, diagonal=0, name=None): paddle.disable_static() - x = paddle.to_variable(data) + x = paddle.to_tensor(data) tril1 = paddle.tensor.tril(x) # array([[ 1, 0, 0, 0], @@ -797,7 +771,7 @@ def triu(x, diagonal=0, name=None): paddle.disable_static() # example 1, default diagonal - x = paddle.to_variable(data) + x = paddle.to_tensor(data) triu1 = paddle.tensor.triu(x) # array([[ 1, 2, 3, 4], # [ 0, 6, 7, 8], diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index b5b528325cd9f52a8b61ef21df0095c41da5a8ed..7ddda5091a0a260f56b29bcedfdcb0786e82ddd6 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -810,7 +810,7 @@ def cholesky(x, upper=False, name=None): a = np.random.rand(3, 3) a_t = np.transpose(a, [1, 0]) x_data = np.matmul(a, a_t) + 1e-03 - x = paddle.to_variable(x_data) + x = paddle.to_tensor(x_data) out = paddle.cholesky(x, upper=False) print(out.numpy()) # [[1.190523 0. 0. ] @@ -855,15 +855,16 @@ def bmm(x, y, name=None): Examples: import paddle - # In imperative mode: - # size input1: (2, 2, 3) and input2: (2, 3, 2) - input1 = np.array([[[1.0, 1.0, 1.0],[2.0, 2.0, 2.0]],[[3.0, 3.0, 3.0],[4.0, 4.0, 4.0]]]) - input2 = np.array([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],[[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]]) - paddle.disable_static() - - x = paddle.to_variable(input1) - y = paddle.to_variable(input2) + + # In imperative mode: + # size x: (2, 2, 3) and y: (2, 3, 2) + x = paddle.to_tensor([[[1.0, 1.0, 1.0], + [2.0, 2.0, 2.0]], + [[3.0, 3.0, 3.0], + [4.0, 4.0, 4.0]]]) + y = paddle.to_tensor([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]], + [[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]]) out = paddle.bmm(x, y) #output size: (2, 2, 2) #output value: @@ -924,10 +925,8 @@ def histogram(input, bins=100, min=0, max=0): Code Example 2: .. code-block:: python import paddle - import numpy as np paddle.disable_static(paddle.CPUPlace()) - inputs_np = np.array([1, 2, 1]).astype(np.float) - inputs = paddle.to_variable(inputs_np) + inputs = paddle.to_tensor([1, 2, 1]) result = paddle.histogram(inputs, bins=4, min=0, max=3) print(result) # [0, 2, 1, 0] paddle.enable_static() diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 36b558d597c1ce1333a8f1eec54e2fd2813625e3..5fd714421c8ed14820738543a1824c779296d7c3 100644 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -71,13 +71,12 @@ def equal_all(x, y, name=None): Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 2, 3])) - z = paddle.to_variable(np.array([1, 4, 3])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 2, 3]) + z = paddle.to_tensor([1, 4, 3]) result1 = paddle.equal_all(x, y) print(result1.numpy()) # result1 = [True ] result2 = paddle.equal_all(x, z) @@ -120,14 +119,11 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([10000., 1e-07]).astype("float32") - np_y = np.array([10000.1, 1e-08]).astype("float32") - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([10000., 1e-07]) + y = paddle.to_tensor([10000.1, 1e-08]) result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan") np_result1 = result1.numpy() @@ -137,10 +133,8 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): np_result2 = result2.numpy() # [False] - np_x = np.array([1.0, float('nan')]).astype("float32") - np_y = np.array([1.0, float('nan')]).astype("float32") - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([1.0, float('nan')]) + y = paddle.to_tensor([1.0, float('nan')]) result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan") np_result1 = result1.numpy() @@ -195,12 +189,11 @@ def equal(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.equal(x, y) print(result1.numpy()) # result1 = [True False False] """ @@ -227,12 +220,11 @@ def greater_equal(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.greater_equal(x, y) print(result1.numpy()) # result1 = [True False True] """ @@ -259,12 +251,11 @@ def greater_than(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.greater_than(x, y) print(result1.numpy()) # result1 = [False False True] """ @@ -292,12 +283,11 @@ def less_equal(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.less_equal(x, y) print(result1.numpy()) # result1 = [True True False] """ @@ -325,12 +315,11 @@ def less_than(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.less_than(x, y) print(result1.numpy()) # result1 = [False True False] """ @@ -358,12 +347,12 @@ def not_equal(x, y, name=None): Examples: .. 
code-block:: python - import numpy as np + import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.not_equal(x, y) print(result1.numpy()) # result1 = [False True True] """ diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 5a01fff88c16bfa584479d71ea93d78999de40df..db1222fa421ef61e6f68f0d69ad0fe7f5d80f6d5 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -85,11 +85,6 @@ def concat(x, axis=0, name=None): name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - Raises: - TypeError: ``x`` must be list or tuple. - TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32 and int64. - TypeError: The ``axis`` must be int or Tensor. The dtype of ``axis`` must be int32 or int64 when it's a Tensor. - TypeError: All the Tensors in ``x`` must have the same data type. Returns: Tensor: A Tensor with the same data type as ``x``. @@ -98,18 +93,14 @@ def concat(x, axis=0, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() # Now we are in imperative mode - in1 = np.array([[1, 2, 3], - [4, 5, 6]]) - in2 = np.array([[11, 12, 13], - [14, 15, 16]]) - in3 = np.array([[21, 22], - [23, 24]]) - x1 = paddle.to_tensor(in1) - x2 = paddle.to_tensor(in2) - x3 = paddle.to_tensor(in3) + x1 = paddle.to_tensor([[1, 2, 3], + [4, 5, 6]]) + x2 = paddle.to_tensor([[11, 12, 13], + [14, 15, 16]]) + x3 = paddle.to_tensor([[21, 22], + [23, 24]]) zero = paddle.full(shape=[1], dtype='int32', fill_value=0) # When the axis is negative, the real axis is (axis + Rank(x)) # As follow, axis is -1, Rank(x) is 2, the real axis is 1 @@ -158,7 +149,7 @@ def flip(x, axis, name=None): image_shape=(3, 2, 2) x = np.arange(image_shape[0] * image_shape[1] * image_shape[2]).reshape(image_shape) x = x.astype('float32') - img = paddle.to_variable(x) + img = paddle.to_tensor(x) out = paddle.flip(img, [0,1]) print(out) # [[[10,11][8, 9]],[[6, 7],[4, 5]] [[2, 3],[0, 1]]] @@ -250,7 +241,7 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]).reshape(image_shape) / 100. x = x.astype('float32') - img = paddle.to_variable(x) + img = paddle.to_tensor(x) out = paddle.flatten(img, start_axis=1, stop_axis=2) # out shape is [2, 12, 4] """ @@ -315,15 +306,13 @@ def roll(x, shifts, axis=None, name=None): Examples: .. code-block:: python - import numpy as np import paddle import paddle.fluid as fluid - data = np.array([[1.0, 2.0, 3.0], - [4.0, 5.0, 6.0], - [7.0, 8.0, 9.0]]) paddle.disable_static() - x = paddle.to_variable(data) + x = paddle.to_tensor([[1.0, 2.0, 3.0], + [4.0, 5.0, 6.0], + [7.0, 8.0, 9.0]]) out_z1 = paddle.roll(x, shifts=1) print(out_z1.numpy()) #[[9. 1. 2.] @@ -433,8 +422,7 @@ def stack(x, axis=0, name=None): [5.0, 6.0] ] ] Args: - x (Tensor|list[Tensor]|tuple[Tensor]): Input ``x`` can be a single tensor, or a ``list`` or ``tuple`` of tensors. - If ``x`` is a ``list`` or ``tuple`` , the Tensors in ``x`` + x (list[Tensor]|tuple[Tensor]): Input ``x`` can be a ``list`` or ``tuple`` of tensors, the Tensors in ``x`` must be of the same shape and dtype. Supported data types: float32, float64, int32, int64. 
axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``, where ``R`` is the number of dimensions of the first input tensor ``x[0]``. @@ -448,17 +436,11 @@ def stack(x, axis=0, name=None): .. code-block:: python import paddle - import numpy as np - - data1 = np.array([[1.0, 2.0]]) - data2 = np.array([[3.0, 4.0]]) - data3 = np.array([[5.0, 6.0]]) - + paddle.disable_static() - x1 = paddle.to_variable(data1) - x2 = paddle.to_variable(data2) - x3 = paddle.to_variable(data3) - + x1 = paddle.to_tensor([[1.0, 2.0]]) + x2 = paddle.to_tensor([[3.0, 4.0]]) + x3 = paddle.to_tensor([[5.0, 6.0]]) out = paddle.stack([x1, x2, x3], axis=0) print(out.shape) # [3, 1, 2] print(out.numpy()) @@ -487,10 +469,7 @@ def split(x, num_or_sections, axis=0, name=None): For more information, please refer to :ref:`api_guide_Name` . Returns: list(Tensor): The list of segmented Tensors. - Raises: - TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: ``num_or_sections`` is not int, list or tuple. - TypeError: ``axis`` is not int or Tensor. the data type of ``axis`` must be int32 or int64 when it's a Tensor. + Example: .. code-block:: python @@ -638,12 +617,10 @@ def unique(x, Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([2, 3, 3, 1, 5, 3]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([2, 3, 3, 1, 5, 3]) unique = paddle.unique(x) np_unique = unique.numpy() # [1 2 3 5] _, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True) @@ -651,8 +628,7 @@ def unique(x, np_inverse = inverse.numpy() # [1 2 2 0 3 2] np_counts = counts.numpy() # [1 1 3 1] - x_data = np.array([[2, 1, 3], [3, 0, 1], [2, 1, 3]]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3]]) unique = paddle.unique(x) np_unique = unique.numpy() # [0 1 2 3] @@ -770,8 +746,6 @@ def unsqueeze(x, axis, name=None): print(out3.shape) # [1, 1, 1, 5, 10] """ - if isinstance(axis, int): - axis = [axis] return layers.unsqueeze(x, axis, name) @@ -812,23 +786,15 @@ def gather(x, index, axis=None, name=None): Returns: output (Tensor): The output is a tensor with the same rank as ``x``. - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must to be one of float16, float32, float64, int32, int64, uint8. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64. - TypeError: ``axis`` must be a Tensor or int and the data type of ``index`` must be int32 or int64 when it's a Tensor. - Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - input_1 = np.array([[1,2],[3,4],[5,6]]) - index_1 = np.array([0,1]) - input = paddle.to_tensor(input_1) - index = paddle.to_tensor(index_1) + input = paddle.to_tensor([[1,2],[3,4],[5,6]]) + index = paddle.to_tensor([0,1]) output = paddle.gather(input, index, axis=0) # expected output: [[1,2],[3,4]] """ @@ -964,16 +930,11 @@ def scatter(x, index, updates, overwrite=True, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - x_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float32) - index_data = np.array([2, 1, 0, 1]).astype(np.int64) - updates_data = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float32) - - x = paddle.to_tensor(x_data) - index = paddle.to_tensor(index_data) - updates = paddle.to_tensor(updates_data) + x = paddle.to_tensor([[1, 1], [2, 2], [3, 3]], dtype='float32') + index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + updates = paddle.to_tensor([[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') output1 = paddle.scatter(x, index, updates, overwrite=False) # [[3., 3.], @@ -1026,10 +987,7 @@ def chunk(x, chunks, axis=0, name=None): For more information, please refer to :ref:`api_guide_Name` . Returns: list(Tensor): The list of segmented Tensors. - Raises: - TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: ``chunks`` is not int. - TypeError: ``axis`` is not int or Tensor. the data type of ``axis`` must be int32 or int64 when it's a Tensor. + Example: .. code-block:: python @@ -1041,7 +999,7 @@ def chunk(x, chunks, axis=0, name=None): x_np = np.random.random([3, 9, 5]).astype("int32") x = paddle.to_tensor(x_np) - out0, out1, out22 = paddle.chunk(x, chunks=3, axis=1) + out0, out1, out2 = paddle.chunk(x, chunks=3, axis=1) # out0.shape [3, 3, 5] # out1.shape [3, 3, 5] # out2.shape [3, 3, 5] @@ -1080,11 +1038,9 @@ def tile(x, repeat_times, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_data = np.array([1, 2, 3]).astype('int32') - data = paddle.to_tensor(np_data) + data = paddle.to_tensor([1, 2, 3], dtype='int32') out = paddle.tile(data, repeat_times=[2, 1]) np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] @@ -1093,8 +1049,7 @@ def tile(x, repeat_times, name=None): np_out = out.numpy() # [[1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]] - np_repeat_times = np.array([2, 1]).astype("int32") - repeat_times = paddle.to_tensor(np_repeat_times) + repeat_times = paddle.to_tensor([2, 1], dtype='int32') out = paddle.tile(data, repeat_times=repeat_times) np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] @@ -1162,15 +1117,12 @@ def expand_as(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - np_data_x = np.array([1, 2, 3]).astype('int32') - np_data_y = np.array([[1, 2, 3], [4, 5, 6]]).astype('int32') - data_x = paddle.to_tensor(np_data_x) - data_y = paddle.to_tensor(np_data_y) + data_x = paddle.to_tensor([1, 2, 3], 'int32') + data_y = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], 'int32') out = paddle.expand_as(data_x, data_y) np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] @@ -1218,12 +1170,10 @@ def expand(x, shape, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - np_data = np.array([1, 2, 3]).astype('int32') - data = paddle.to_tensor(np_data) + data = paddle.to_tensor([1, 2, 3], dtype='int32') out = paddle.expand(data, shape=[2, 3]) out = out.numpy() # [[1, 2, 3], [1, 2, 3]] @@ -1322,11 +1272,6 @@ def reshape(x, shape, name=None): Returns: Tensor: A reshaped Tensor with the same data type as ``x``. - Raises: - ValueError: If more than one elements of ``shape`` is -1. - ValueError: If the element of ``shape`` is 0, the corresponding dimension should be less than or equal to the dimension of ``x``. - ValueError: If the elements in ``shape`` is negative except -1. - Examples: .. 
code-block:: python @@ -1413,23 +1358,16 @@ def gather_nd(x, index, name=None): Returns: output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:] - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of float32, float64, int32 and int64. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be one of int32 and int64. - Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([[[1, 2], [3, 4], [5, 6]], - [[7, 8], [9, 10], [11, 12]]]) - np_index = [[0, 1]] - x = paddle.to_tensor(np_x) - index = paddle.to_tensor(np_index) + x = paddle.to_tensor([[[1, 2], [3, 4], [5, 6]], + [[7, 8], [9, 10], [11, 12]]]) + index = paddle.to_tensor([[0, 1]]) output = paddle.gather_nd(x, index) #[[3, 4]] diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index b6314ef1ba37937a39073ec68cf1cf540b27bf64..ed2bbe03a366054dfe7d798310c7fa5d419b44a8 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -174,14 +174,12 @@ def pow(x, y, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() # example 1: y is a float - x_data = np.array([1, 2, 3]) + x = paddle.to_tensor([1, 2, 3]) y = 2 - x = paddle.to_tensor(x_data) res = paddle.pow(x, y) print(res.numpy()) # [1 4 9] @@ -291,13 +289,10 @@ Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([2, 3, 4]).astype('float64') - np_y = np.array([1, 5, 2]).astype('float64') - x = paddle.to_variable(np_x) - y = paddle.to_variable(np_y) + x = paddle.to_tensor([2, 3, 4], 'float64') + y = paddle.to_tensor([1, 5, 2], 'float64') z = paddle.add(x, y) np_z = z.numpy() print(np_z) # [3., 8., 6. ] @@ -335,14 +330,11 @@ def divide(x, y, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([2, 3, 4]).astype('float64') - np_y = np.array([1, 5, 2]).astype('float64') - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([2, 3, 4], dtype='float64') + y = paddle.to_tensor([1, 5, 2], dtype='float64') z = paddle.divide(x, y) print(z.numpy()) # [2., 0.6, 2.] @@ -440,14 +432,11 @@ def floor_divide(x, y, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([2, 3, 8, 7]) - np_y = np.array([1, 5, 3, 3]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([2, 3, 8, 7]) + y = paddle.to_tensor([1, 5, 3, 3]) z = paddle.floor_divide(x, y) print(z.numpy()) # [2, 0, 2, 2] @@ -530,14 +519,11 @@ def remainder(x, y, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([2, 3, 8, 7]) - np_y = np.array([1, 5, 3, 3]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([2, 3, 8, 7]) + y = paddle.to_tensor([1, 5, 3, 3]) z = paddle.remainder(x, y) print(z.numpy()) # [0, 3, 2, 1] @@ -612,20 +598,15 @@ def multiply(x, y, axis=-1, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) - y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) + x = paddle.to_tensor([[1, 2], [3, 4]]) + y = paddle.to_tensor([[5, 6], [7, 8]]) res = paddle.multiply(x, y) print(res.numpy()) # [[5, 12], [21, 32]] - x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) - y_data = np.array([1, 2], dtype=np.float32) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) + x = paddle.to_tensor([[[1, 2, 3], [1, 2, 3]]]) + y = paddle.to_tensor([1, 2]) res = paddle.multiply(x, y, axis=1) print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]] @@ -654,36 +635,28 @@ Examples: paddle.disable_static() - x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) - y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([[1, 2], [3, 4]]) + y = paddle.to_tensor([[5, 6], [7, 8]]) res = paddle.maximum(x, y) print(res.numpy()) #[[5. 6.] # [7. 8.]] - x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) - y_data = np.array([1, 2], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([[[1, 2, 3], [1, 2, 3]]]) + y = paddle.to_tensor([1, 2]) res = paddle.maximum(x, y, axis=1) print(res.numpy()) #[[[1. 2. 3.] # [2. 2. 3.]]] - x_data = np.array([2, 3, 5], dtype=np.float32) - y_data = np.array([1, 4, np.nan], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([2, 3, 5], dtype='float32') + y = paddle.to_tensor([1, 4, np.nan], dtype='float32') res = paddle.maximum(x, y) print(res.numpy()) #[ 2. 4. nan] - x_data = np.array([5, 3, np.inf], dtype=np.float32) - y_data = np.array([1, 4, 5], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([5, 3, np.inf], dtype='float32') + y = paddle.to_tensor([1, 4, 5], dtype='float32') res = paddle.maximum(x, y) print(res.numpy()) #[ 5. 4. inf] @@ -703,38 +676,31 @@ Examples: import paddle import numpy as np + paddle.disable_static() - x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) - y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([[1, 2], [3, 4]], dtype='float32') + y = paddle.to_tensor([[5, 6], [7, 8]], dtype='float32') res = paddle.minimum(x, y) print(res.numpy()) #[[1. 2.] # [3. 4.]] - x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) - y_data = np.array([1, 2], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([[[1, 2, 3], [1, 2, 3]]], dtype='float32') + y = paddle.to_tensor([1, 2], dtype='float32') res = paddle.minimum(x, y, axis=1) print(res.numpy()) #[[[1. 1. 1.] # [2. 2. 2.]]] - x_data = np.array([2, 3, 5], dtype=np.float32) - y_data = np.array([1, 4, np.nan], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([2, 3, 5], dtype='float32') + y = paddle.to_tensor([1, 4, np.nan], dtype='float32') res = paddle.minimum(x, y) print(res.numpy()) #[ 1. 3. 
nan] - x_data = np.array([5, 3, np.inf], dtype=np.float32) - y_data = np.array([1, 4, 5], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([5, 3, np.inf], dtype='float32') + y = paddle.to_tensor([1, 4, 5], dtype='float32') res = paddle.minimum(x, y) print(res.numpy()) #[1. 3. 5.] @@ -794,33 +760,33 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): it's data type is the same as `x`. Raises: - ValueError: The :attr:`dtype` must be float64 or int64. + ValueError: If the data type of `x` is float64, :attr:`dtype` can not be float32 or int32. + ValueError: If the data type of `x` is int64, :attr:`dtype` can not be int32. TypeError: The type of :attr:`axis` must be int, list or tuple. Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - # x is a Tensor variable with following elements: + # x is a Tensor with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] # Each example is followed by the corresponding output tensor. - x_data = np.array([[0.2, 0.3, 0.5, 0.9],[0.1, 0.2, 0.6, 0.7]]).astype('float32') - x = paddle.to_variable(x_data) + x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + [0.1, 0.2, 0.6, 0.7]]) out1 = paddle.sum(x) # [3.5] out2 = paddle.sum(x, axis=0) # [0.3, 0.5, 1.1, 1.6] out3 = paddle.sum(x, axis=-1) # [1.9, 1.6] out4 = paddle.sum(x, axis=1, keepdim=True) # [[1.9], [1.6]] - # y is a Tensor variable with shape [2, 2, 2] and elements as below: + # y is a Tensor with shape [2, 2, 2] and elements as below: # [[[1, 2], [3, 4]], # [[5, 6], [7, 8]]] # Each example is followed by the corresponding output tensor. - y_data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]).astype('float32') - y = paddle.to_variable(y_data) + y = paddle.to_tensor([[[1, 2], [3, 4]], + [[5, 6], [7, 8]]]) out5 = paddle.sum(y, axis=[1, 2]) # [10, 26] out6 = paddle.sum(y, axis=[0, 1]) # [16, 20] """ @@ -850,10 +816,6 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): 'out_dtype': convert_np_dtype_to_dtype_(dtype) }) dtype_flag = True - else: - raise ValueError( - "The value of 'dtype' in sum op must be float64, int64, but received of {}". - format(dtype)) if in_dygraph_mode(): axis = axis if axis != None and axis != [] else [0] @@ -867,6 +829,17 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): 'reduce_all', reduce_all_flag) check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int32', 'int64'], 'sum') + + if dtype is not None: + check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'sum') + x_dtype = convert_dtype(x.dtype) + + if (x_dtype == "float64" and dtype in ["float32", "int32"]) or \ + (x_dtype == "int64" and dtype == "int32"): + raise ValueError("The input(x)'s dtype is {} but the attr(dtype) of sum is {}, " + "which may cause data type overflows. Please reset attr(dtype) of sum." + .format(x_dtype, dtype)) + check_type(axis, 'axis', (int, list, tuple, type(None)), 'sum') helper = LayerHelper('sum', **locals()) @@ -1121,9 +1094,9 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): paddle.disable_static() - x = paddle.to_variable(data_x) - y = paddle.to_variable(data_y) - input = paddle.to_variable(data_input) + x = paddle.to_tensor(data_x) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input) out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 ) @@ -1204,12 +1177,10 @@ def logsumexp(x, axis=None, keepdim=False, name=None): .. 
code-block:: python

            import paddle
-           import numpy as np

            paddle.disable_static()

-           x = np.array([[-1.5, 0., 2.], [3., 1.2, -2.4]])
-           x = paddle.to_tensor(x)
+           x = paddle.to_tensor([[-1.5, 0., 2.], [3., 1.2, -2.4]])
            out1 = paddle.logsumexp(x) # [3.4691226]
            out2 = paddle.logsumexp(x, 1) # [2.15317821, 3.15684602]
@@ -1260,12 +1231,10 @@ def inverse(x, name=None):
    Examples:
        .. code-block:: python

-           import numpy as np
            import paddle
-
-           mat_np = np.array([[2, 0], [0, 2]]).astype("float32")
            paddle.disable_static()
-           mat = paddle.to_variable(mat_np)
+
+           mat = paddle.to_tensor([[2, 0], [0, 2]], dtype='float32')
            inv = paddle.inverse(mat)
            print(inv) # [[0.5, 0], [0, 0.5]]

@@ -1316,16 +1285,15 @@ def max(x, axis=None, keepdim=False, name=None):
    Examples:
        .. code-block:: python

-           import numpy as np
            import paddle

            paddle.disable_static()

-           # data_x is a variable with shape [2, 4]
+           # x is a tensor with shape [2, 4]
            # the axis is a int element
-           data_x = np.array([[0.2, 0.3, 0.5, 0.9],
-                              [0.1, 0.2, 0.6, 0.7]])
-           x = paddle.to_variable(data_x)
+
+           x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9],
+                                 [0.1, 0.2, 0.6, 0.7]])
            result1 = paddle.max(x)
            print(result1.numpy())
            #[0.9]
@@ -1342,9 +1310,9 @@ def max(x, axis=None, keepdim=False, name=None):
-           # data_y is a variable with shape [2, 2, 2]
+           # y is a tensor with shape [2, 2, 2]
            # the axis is list
-           data_y = np.array([[[1.0, 2.0], [3.0, 4.0]],
-                              [[5.0, 6.0], [7.0, 8.0]]])
-           y = paddle.to_variable(data_y)
+
+           y = paddle.to_tensor([[[1.0, 2.0], [3.0, 4.0]],
+                                 [[5.0, 6.0], [7.0, 8.0]]])
            result5 = paddle.max(y, axis=[1, 2])
            print(result5.numpy())
            #[4. 8.]
@@ -1411,16 +1379,14 @@ def min(x, axis=None, keepdim=False, name=None):
    Examples:
        ..
code-block:: python import paddle - import numpy as np paddle.disable_static() - x = np.array([[1.2,3.5], [4.5,6.4]]).astype('float32') - x1 = paddle.to_variable(x) + x1 = paddle.to_tensor([[1.2, 3.5], [4.5, 6.4]], 'float32') out1 = paddle.clip(x1, min=3.5, max=5.0) out2 = paddle.clip(x1, min=2.5) print(out1.numpy()) @@ -1653,7 +1616,7 @@ def clip(x, min=None, max=None, name=None): helper = LayerHelper('clip', **locals()) output = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype('x')) helper.append_op( type='clip', inputs=inputs, outputs={'Out': [output]}, attrs=attrs) @@ -1701,9 +1664,9 @@ def trace(x, offset=0, axis1=0, axis2=1, name=None): paddle.disable_static() - case1 = paddle.to_variable(case1) - case2 = paddle.to_variable(case2) - case3 = paddle.to_variable(case3) + case1 = paddle.to_tensor(case1) + case2 = paddle.to_tensor(case2) + case3 = paddle.to_tensor(case3) data1 = paddle.trace(case1) # data1.shape = [1] data2 = paddle.trace(case2, offset=1, axis1=1, axis2=2) # data2.shape = [3] data3 = paddle.trace(case3, offset=-3, axis1=1, axis2=-1) # data2.shape = [3, 5] @@ -1894,10 +1857,8 @@ def isfinite(x, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) - x = paddle.to_tensor(x_np) + x = paddle.to_tensor([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) out = paddle.tensor.isfinite(x) print(out.numpy()) # [False True True False True False False] """ @@ -1925,10 +1886,8 @@ def isinf(x, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) - x = paddle.to_tensor(x_np) + x = paddle.to_tensor([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) out = paddle.tensor.isinf(x) print(out.numpy()) # [ True False False True False False False] """ @@ -1956,10 +1915,8 @@ def isnan(x, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) - x = paddle.to_tensor(x_np) + x = paddle.to_tensor([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) out = paddle.tensor.isnan(x) print(out.numpy()) # [False False False False False True True] """ @@ -2002,14 +1959,12 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() # the axis is a int element - data_x = np.array([[0.2, 0.3, 0.5, 0.9], - [0.1, 0.2, 0.6, 0.7]]).astype(np.float32) - x = paddle.to_tensor(data_x) + x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + [0.1, 0.2, 0.6, 0.7]]) out1 = paddle.prod(x) print(out1.numpy()) # [0.0002268] @@ -2035,9 +1990,8 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): # int64 # the axis is list - data_y = np.array([[[1.0, 2.0], [3.0, 4.0]], - [[5.0, 6.0], [7.0, 8.0]]]) - y = paddle.to_tensor(data_y) + y = paddle.to_tensor([[[1.0, 2.0], [3.0, 4.0]], + [[5.0, 6.0], [7.0, 8.0]]]) out6 = paddle.prod(y, [0, 1]) print(out6.numpy()) # [105. 384.] @@ -2070,12 +2024,10 @@ def sign(x, name=None): Examples: .. 
code-block:: python - import numpy as np import paddle - data = np.array([3.0, 0.0, -2.0, 1.7], dtype='float32') paddle.disable_static() - x = paddle.to_tensor(data) + x = paddle.to_tensor([3.0, 0.0, -2.0, 1.7], dtype='float32') out = paddle.sign(x=x) print(out) # [1.0, 0.0, -1.0, 1.0] """ @@ -2110,12 +2062,9 @@ def tanh(x, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.tanh(x) print(out.numpy()) # [-0.37994896 -0.19737532 0.09966799 0.29131261] diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 6b08599fad1dfc6b5d60c3798bba802a5ddefd02..b38a1d0f5b7e92b0eac907170aad76a2b5c69bc1 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -14,17 +14,12 @@ # TODO: define random functions -import numpy as np - from ..fluid import core -from ..fluid.framework import device_guard, in_dygraph_mode, _varbase_creator, Variable, convert_np_dtype_to_dtype_ -from ..fluid.layers.layer_function_generator import templatedoc +from ..fluid.framework import in_dygraph_mode, Variable, convert_np_dtype_to_dtype_ from ..fluid.layer_helper import LayerHelper -from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype +from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, check_shape from ..fluid.layers import utils -from ..fluid.layers.tensor import fill_constant import paddle -import warnings from ..fluid.io import shuffle #DEFINE_ALIAS @@ -65,7 +60,6 @@ def bernoulli(x, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() @@ -94,26 +88,26 @@ def bernoulli(x, name=None): return out -def gaussian_random(shape, mean=0.0, std=1.0, dtype=None, name=None): +def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): """ This OP returns a Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. Args: - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` is a list or tuple, the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - mean(float|int, optional): Mean of the output tensor, default is 0.0. - std(float|int, optional): Standard deviation of the output tensor, default + mean (float|int, optional): Mean of the output tensor, default is 0.0. + std (float|int, optional): Standard deviation of the output tensor, default is 1.0. - seed(int, optional): ${seed_comment} - dtype(str|np.dtype, optional): The data type of the output Tensor. + seed (int, optional): Random seed of generator. + dtype (str|np.dtype, optional): The data type of the output Tensor. Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` for details). - name(str, optional): The default value is None. Normally there is no + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. 
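Note between the two ``python/paddle/tensor/random.py`` hunks above: the patch renames ``gaussian_random`` to ``gaussian``, but the hunks shown here carry no usage example for it. A minimal dygraph sketch follows; it assumes the function stays reachable as ``paddle.tensor.random.gaussian`` after this change, and the printed values are illustrative only.

.. code-block:: python

    import paddle

    paddle.disable_static()

    # Sample a 2x3 tensor from N(mean=1.0, std=2.0). With dtype=None the
    # result uses paddle.get_default_dtype(), i.e. float32 by default.
    out = paddle.tensor.random.gaussian(shape=[2, 3], mean=1.0, std=2.0)
    # [[ 2.31,  0.68, -1.05],   # random
    #  [ 3.97,  1.88,  0.62]]   # random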
@@ -121,26 +115,26 @@ def gaussian_random(shape, mean=0.0, std=1.0, dtype=None, name=None): Tensor: A Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. """ + op_type_for_check = 'gaussian/standard_normal/randn/normal' + seed = 0 + if dtype is None: dtype = paddle.framework.get_default_dtype() if dtype not in ['float32', 'float64']: raise TypeError( - "gaussian_random only supports [float32, float64], but the default dtype is %s" - % dtype) - + "{} only supports [float32, float64], but the default dtype is {}" + .format(op_type_for_check, dtype)) if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - seed = 0 - op_type_for_check = 'gaussian_random/standard_normal/randn/normal' if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) return core.ops.gaussian_random('shape', shape, 'mean', float(mean), 'std', float(std), 'seed', seed, 'dtype', dtype) - check_type(shape, 'shape', (list, tuple, Variable), op_type_for_check) + check_shape(shape, op_type_for_check) check_dtype(dtype, 'dtype', ['float32', 'float64'], op_type_for_check) inputs = {} @@ -151,10 +145,10 @@ def gaussian_random(shape, mean=0.0, std=1.0, dtype=None, name=None): 'dtype': dtype, 'use_mkldnn': False } - utils._get_shape_tensor_inputs( + utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, op_type=op_type_for_check) - helper = LayerHelper('gaussian_random', **locals()) + helper = LayerHelper('gaussian', **locals()) out = helper.create_variable_for_type_inference(dtype) helper.append_op( type='gaussian_random', @@ -172,12 +166,12 @@ def standard_normal(shape, dtype=None, name=None): and ``dtype``. Args: - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` is a list or tuple, the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - dtype(str|np.dtype, optional): The data type of the output Tensor. + dtype (str|np.dtype, optional): The data type of the output Tensor. Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` for details). @@ -189,27 +183,22 @@ def standard_normal(shape, dtype=None, name=None): normal distribution with mean 0 and standard deviation 1, with ``shape`` and ``dtype``. - Raises: - TypeError: If ``shape`` is not list, tuple, Tensor. - TypeError: If ``dtype`` is not float32, float64. - Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() # example 1: attr shape is a list which doesn't contain Tensor. - result_1 = paddle.standard_normal(shape=[2, 3]) + out1 = paddle.standard_normal(shape=[2, 3]) # [[-2.923464 , 0.11934398, -0.51249987], # random # [ 0.39632758, 0.08177969, 0.2692008 ]] # random # example 2: attr shape is a list which contains Tensor. 
- dim_1 = paddle.fill_constant([1], "int64", 2) - dim_2 = paddle.fill_constant([1], "int32", 3) - result_2 = paddle.standard_normal(shape=[dim_1, dim_2, 2]) + dim1 = paddle.full([1], 2, "int64") + dim2 = paddle.full([1], 3, "int32") + out2 = paddle.standard_normal(shape=[dim1, dim2, 2]) # [[[-2.8852394 , -0.25898588], # random # [-0.47420555, 0.17683524], # random # [-0.7989969 , 0.00754541]], # random @@ -218,21 +207,14 @@ def standard_normal(shape, dtype=None, name=None): # [ 0.8086993 , 0.6868893 ]]] # random # example 3: attr shape is a Tensor, the data type must be int64 or int32. - var_shape = paddle.to_tensor(np.array([2, 3])) - result_3 = paddle.standard_normal(var_shape) + shape_tensor = paddle.to_tensor([2, 3]) + result_3 = paddle.standard_normal(shape_tensor) + # [[-2.878077 , 0.17099959, 0.05111201] # random # [-0.3761474, -1.044801 , 1.1870178 ]] # random """ - if dtype is None: - dtype = paddle.framework.get_default_dtype() - if dtype not in ['float32', 'float64']: - raise TypeError( - "standard_normal only supports [float32, float64], but the default dtype is %s" - % dtype) - - return gaussian_random( - shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name) + return gaussian(shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name) randn = standard_normal @@ -275,7 +257,6 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() @@ -283,11 +264,11 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): # [[ 0.17501129 0.32364586 1.561118 ] # random # [-1.7232178 1.1545963 -0.76156676]] # random - mean_tensor = paddle.to_tensor(np.array([1.0, 2.0, 3.0])) + mean_tensor = paddle.to_tensor([1.0, 2.0, 3.0]) out2 = paddle.normal(mean=mean_tensor) # [ 0.18644847 -1.19434458 3.93694787] # random - std_tensor = paddle.to_tensor(np.array([1.0, 2.0, 3.0])) + std_tensor = paddle.to_tensor([1.0, 2.0, 3.0]) out3 = paddle.normal(mean=mean_tensor, std=std_tensor) # [1.00780561 3.78457445 5.81058198] # random @@ -306,16 +287,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): "If std is Tensor, it's data type only support float32, float64." ) if shape is not None: - if isinstance(shape, (list, tuple)): - for item in shape: - check_type(item, 'shape', (int), 'normal', - 'Elements of shape should be int.') - elif isinstance(shape, Variable): - check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'normal') - else: - assert TypeError( - 'If mean and std are all not Tensor, shape should be list, tuple, Tensor.' - ) + check_shape(shape, 'normal') if isinstance(mean, Variable): if isinstance(std, Variable): @@ -330,7 +302,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): mean = float(mean) out = standard_normal(paddle.shape(std), std.dtype, name) else: - return gaussian_random(shape=shape, mean=mean, std=std, name=name) + return gaussian(shape=shape, mean=mean, std=std, name=name) out = out * std + mean if not in_dygraph_mode(): @@ -383,7 +355,6 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() @@ -405,8 +376,7 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): # example 3: # attr shape is a Tensor, the data type must be int64 or int32. 
-           shape = np.array([2, 3])
-           shape_tensor = paddle.to_tensor(shape)
+           shape_tensor = paddle.to_tensor([2, 3])
            result_3 = paddle.tensor.random.uniform(shape_tensor)
            # if shape_tensor's value is [2, 3]
            # result_3 is:
@@ -419,27 +389,27 @@
        dtype = paddle.framework.get_default_dtype()
        if dtype not in ['float32', 'float64']:
            raise TypeError(
-               "uniform only supports [float32, float64], but the default dtype is %s"
-               % dtype)
+               "uniform/rand only supports [float32, float64], but the default dtype is {}".
+               format(dtype))
    if not isinstance(dtype, core.VarDesc.VarType):
        dtype = convert_np_dtype_to_dtype_(dtype)

    if in_dygraph_mode():
-       shape = utils._convert_shape_to_list(shape)
+       shape = utils.convert_shape_to_list(shape)
        return core.ops.uniform_random('shape', shape, 'min', float(min), 'max',
                                       float(max), 'seed', seed, 'dtype', dtype)

-   check_type(shape, 'shape', (list, tuple, Variable), 'uniform_random/rand')
-   check_dtype(dtype, 'dtype', ('float32', 'float64'), 'uniform_random/rand')
+   check_type(shape, 'shape', (list, tuple, Variable), 'uniform/rand')
+   check_dtype(dtype, 'dtype', ('float32', 'float64'), 'uniform/rand')

    inputs = dict()
    attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype}
-   utils._get_shape_tensor_inputs(
-       inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand')
+   utils.get_shape_tensor_inputs(
+       inputs=inputs, attrs=attrs, shape=shape, op_type='uniform/rand')

-   helper = LayerHelper("uniform_random", **locals())
+   helper = LayerHelper("uniform", **locals())
    out = helper.create_variable_for_type_inference(dtype)
    helper.append_op(
        type="uniform_random", inputs=inputs, attrs=attrs,
@@ -449,29 +419,26 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None):

 def randint(low=0, high=None, shape=[1], dtype=None, name=None):
     """
-    :alias_main: paddle.randint
-    :alias: paddle.tensor.randint, paddle.tensor.random.randint
-
     This OP returns a Tensor filled with random integers from a discrete uniform
     distribution in the range [``low``, ``high``), with ``shape`` and ``dtype``.
     If ``high`` is None (the default), the range is [0, ``low``).

     Args:
-        low(int): The lower bound on the range of random values to generate.
+        low (int): The lower bound on the range of random values to generate.
            The ``low`` is included in the range. If ``high`` is None, the
            range is [0, ``low``). Default is 0.
-        high(int, optional): The upper bound on the range of random values to
+        high (int, optional): The upper bound on the range of random values to
            generate, the ``high`` is excluded in the range. Default is None
            (see above for behavior if high = None).
-        shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape``
+        shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape``
            is a list or tuple, the elements of it should be integers or Tensors
            (with the shape [1], and the data type int32 or int64). If ``shape``
            is a Tensor, it should be a 1-D Tensor(with the data type int32 or
            int64). Default is [1].
-        dtype(str|np.dtype, optional): The data type of the
+        dtype (str|np.dtype, optional): The data type of the
            output tensor. Supported data types: int32, int64. If ``dtype``
            is None, the data type is int64. Default is None.
-        name(str, optional): The default value is None. Normally there is no
+        name (str, optional): The default value is None. Normally there is no
            need for user to set this property. For more information, please
            refer to :ref:`api_guide_Name`.
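Note between the ``uniform`` and ``randint`` hunks: in dygraph mode both ops now normalize ``shape`` with ``utils.convert_shape_to_list``, and on the static-graph path ``randint`` switches to the shared ``check_shape`` helper, so the three documented spellings of ``shape`` are interchangeable. A minimal dygraph sketch, reusing only names that appear in the patch's own examples:

.. code-block:: python

    import paddle

    paddle.disable_static()

    # 1) shape as a plain list of Python ints
    a = paddle.tensor.random.uniform(shape=[2, 3])

    # 2) shape as a list mixing ints and 1-element int Tensors
    dim = paddle.full([1], 3, "int32")
    b = paddle.tensor.random.uniform(shape=[2, dim])

    # 3) shape as a 1-D int32/int64 Tensor holding the target shape
    shape_tensor = paddle.to_tensor([2, 3])
    c = paddle.tensor.random.uniform(shape=shape_tensor)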
@@ -479,48 +446,43 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None):
        Tensor: A Tensor filled with random integers from a discrete uniform
        distribution in the range [``low``, ``high``), with ``shape`` and ``dtype``.

-    Raises:
-        TypeError: If ``shape`` is not list, tuple, Tensor.
-        TypeError: If ``dtype`` is not int32, int64.
-        ValueError: If ``high`` is not greater then ``low``; If ``high`` is
-            None, and ``low`` is not greater than 0.
-
    Examples:
        .. code-block:: python

            import paddle
-           import numpy as np

            paddle.disable_static()

            # example 1:
            # attr shape is a list which doesn't contain Tensor.
-           result_1 = paddle.randint(low=-5, high=5, shape=[3])
+           out1 = paddle.randint(low=-5, high=5, shape=[3])
            # [0, -3, 2]  # random

            # example 2:
            # attr shape is a list which contains Tensor.
-           dim_1 = paddle.fill_constant([1], "int64", 2)
-           dim_2 = paddle.fill_constant([1], "int32", 3)
-           result_2 = paddle.randint(low=-5, high=5, shape=[dim_1, dim_2], dtype="int32")
+           dim1 = paddle.full([1], 2, "int64")
+           dim2 = paddle.full([1], 3, "int32")
+           out2 = paddle.randint(low=-5, high=5, shape=[dim1, dim2], dtype="int32")
            # [[0, -1, -3],  # random
            #  [4, -2, 0]]  # random

            # example 3:
            # attr shape is a Tensor
-           var_shape = paddle.to_variable(np.array([3]))
-           result_3 = paddle.randint(low=-5, high=5, shape=var_shape)
+
+           shape_tensor = paddle.to_tensor([3])
+           out3 = paddle.randint(low=-5, high=5, shape=shape_tensor)
+
            # [-2, 2, 3]  # random

            # example 4:
            # data type is int32
-           result_4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32')
+           out4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32')
            # [-5, 4, -4]  # random

            # example 5:
            # Input only one parameter
            # low=0, high=10, shape=[1], dtype='int64'
-           result_5 = paddle.randint(10)
+           out5 = paddle.randint(10)
            # [7]  # random

    """
@@ -537,11 +499,11 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None):
        dtype = convert_np_dtype_to_dtype_(dtype)

    if in_dygraph_mode():
-       shape = utils._convert_shape_to_list(shape)
+       shape = utils.convert_shape_to_list(shape)
        return core.ops.randint('shape', shape, 'low', low, 'high', high,
                                'seed', 0, 'dtype', dtype)

-   check_type(shape, 'shape', (list, tuple, Variable), 'randint')
+   check_shape(shape, 'randint')
    check_dtype(dtype, 'dtype', ['int32', 'int64'], 'randint')
    if low >= high:
        raise ValueError(
@@ -550,7 +512,7 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None):
    inputs = dict()
    attrs = {'low': low, 'high': high, 'seed': 0, 'dtype': dtype}
-   utils._get_shape_tensor_inputs(
+   utils.get_shape_tensor_inputs(
        inputs=inputs, attrs=attrs, shape=shape, op_type='randint')

    helper = LayerHelper("randint", **locals())
@@ -560,21 +522,17 @@
-@templatedoc()
 def randperm(n, dtype="int64", name=None):
     """
-    :alias_main: paddle.randperm
-    :alias: paddle.tensor.randperm, paddle.tensor.random.randperm
-
     This OP returns a 1-D Tensor filled with random permutation values from 0
     to n-1, with ``dtype``.

     Args:
-        n(int): The upper bound (exclusive), and it should be greater than 0.
-        dtype(str|np.dtype, optional): The data type of
+        n (int): The upper bound (exclusive), and it should be greater than 0.
+        dtype (str|np.dtype, optional): The data type of
            the output Tensor. Supported data types: int32, int64, float32,
            float64. Default is int64.
-        name(str, optional): The default value is None. Normally there is no
+        name (str, optional): The default value is None. Normally there is no
            need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`. @@ -582,10 +540,6 @@ def randperm(n, dtype="int64", name=None): Tensor: A 1-D Tensor filled with random permutation values from 0 to n-1, with ``dtype``. - Raises: - ValueError: If ``n`` is not greater than 0. - TypeError: If ``dtype`` is not int32, int64, float32, float64. - Examples: .. code-block:: python @@ -593,10 +547,10 @@ def randperm(n, dtype="int64", name=None): paddle.disable_static() - result_1 = paddle.randperm(5) + out1 = paddle.randperm(5) # [4, 1, 2, 3, 0] # random - result_2 = paddle.randperm(7, 'int32') + out2 = paddle.randperm(7, 'int32') # [1, 6, 2, 0, 4, 3, 5] # random """ @@ -622,32 +576,20 @@ def randperm(n, dtype="int64", name=None): def rand(shape, dtype=None, name=None): """ - :alias_main: paddle.rand - :alias: paddle.tensor.rand, paddle.tensor.random.rand - This OP returns a Tensor filled with random values sampled from a uniform distribution in the range [0, 1), with ``shape`` and ``dtype``. - Examples: - :: - - Input: - shape = [1, 2] - - Output: - result=[[0.8505902, 0.8397286]] - Args: - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` is a list or tuple, the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - dtype(str|np.dtype, optional): The data type of the output Tensor. + dtype (str|np.dtype, optional): The data type of the output Tensor. Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` for details). - name(str, optional): The default value is None. Normally there is no + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -655,26 +597,21 @@ def rand(shape, dtype=None, name=None): Tensor: A Tensor filled with random values sampled from a uniform distribution in the range [0, 1), with ``shape`` and ``dtype``. - Raises: - TypeError: If ``shape`` is not list, tuple, Tensor. - ValueError: If ``dtype`` is not float32, float64. - Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() # example 1: attr shape is a list which doesn't contain Tensor. - result_1 = paddle.rand(shape=[2, 3]) + out1 = paddle.rand(shape=[2, 3]) # [[0.451152 , 0.55825245, 0.403311 ], # random # [0.22550228, 0.22106001, 0.7877319 ]] # random # example 2: attr shape is a list which contains Tensor. - dim_1 = paddle.fill_constant([1], "int64", 2) - dim_2 = paddle.fill_constant([1], "int32", 3) - result_2 = paddle.rand(shape=[dim_1, dim_2, 2]) + dim1 = paddle.full([1], 2, "int64") + dim2 = paddle.full([1], 3, "int32") + out2 = paddle.rand(shape=[dim1, dim2, 2]) # [[[0.8879919 , 0.25788337], # random # [0.28826773, 0.9712097 ], # random # [0.26438272, 0.01796806]], # random @@ -683,19 +620,11 @@ def rand(shape, dtype=None, name=None): # [0.870881 , 0.2984597 ]]] # random # example 3: attr shape is a Tensor, the data type must be int64 or int32. 
- var_shape = paddle.to_variable(np.array([2, 3])) - result_3 = paddle.rand(var_shape) + shape_tensor = paddle.to_tensor([2, 3]) + result_3 = paddle.rand(shape_tensor) + # [[0.22920267, 0.841956 , 0.05981819], # random # [0.4836288 , 0.24573246, 0.7516129 ]] # random """ - if dtype is None: - dtype = paddle.framework.get_default_dtype() - if dtype not in ['float32', 'float64']: - raise TypeError( - "rand only supports [float32, float64], but the default dtype is %s" - % dtype) - - out = uniform(shape, dtype, min=0.0, max=1.0, name=name) - out.stop_gradient = True - return out + return uniform(shape, dtype, min=0.0, max=1.0, name=name) diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 552da3401c61d9c046c29bc86b429a8ae1242fa5..f55d285586f0ec6959573af64e720bea5de10c8d 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -66,16 +66,15 @@ def argsort(x, axis=-1, descending=False, name=None): Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - input_array = np.array([[[5,8,9,5], - [0,0,1,7], - [6,9,2,4]], - [[5,2,4,2], - [4,7,7,9], - [1,7,0,6]]]).astype(np.float32) - x = paddle.to_variable(input_array) + x = paddle.to_tensor([[[5,8,9,5], + [0,0,1,7], + [6,9,2,4]], + [[5,2,4,2], + [4,7,7,9], + [1,7,0,6]]], + dtype='float32') out1 = paddle.argsort(x=x, axis=-1) out2 = paddle.argsort(x=x, axis=0) out3 = paddle.argsort(x=x, axis=1) @@ -148,14 +147,12 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - data = np.array([[5,8,9,5], - [0,0,1,7], - [6,9,2,4]]) - x = paddle.to_variable(data) + x = paddle.to_tensor([[5,8,9,5], + [0,0,1,7], + [6,9,2,4]]) out1 = paddle.argmax(x) print(out1.numpy()) # 2 out2 = paddle.argmax(x, axis=1) @@ -169,6 +166,12 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): raise TypeError( "The type of 'axis' must be int or None in argmax, but received %s." % (type(axis))) + + if dtype is None: + raise ValueError( + "the value of 'dtype' in argmax could not be None, but received None" + ) + var_dtype = convert_np_dtype_to_dtype_(dtype) check_dtype(var_dtype, 'dtype', ['int32', 'int64'], 'argmin') flatten = False @@ -222,14 +225,12 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - data = np.array([[5,8,9,5], - [0,0,1,7], - [6,9,2,4]]) - x = paddle.to_variable(data) + x = paddle.to_tensor([[5,8,9,5], + [0,0,1,7], + [6,9,2,4]]) out1 = paddle.argmin(x) print(out1.numpy()) # 4 out2 = paddle.argmin(x, axis=1) @@ -243,6 +244,12 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): raise TypeError( "The type of 'axis' must be int or None in argmin, but received %s." % (type(axis))) + + if dtype is None: + raise ValueError( + "the value of 'dtype' in argmin could not be None, but received None" + ) + var_dtype = convert_np_dtype_to_dtype_(dtype) check_dtype(var_dtype, 'dtype', ['int32', 'int64'], 'argmin') flatten = False @@ -292,24 +299,16 @@ def index_select(x, index, axis=0, name=None): Returns: Tensor: A Tensor with same data type as ``x``. - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of float32, float64, int32 and int64. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64. - Examples: .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() # Now we are in imperative mode - data = np.array([[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [9.0, 10.0, 11.0, 12.0]]) - data_index = np.array([0, 1, 1]).astype('int32') - - x = paddle.to_tensor(data) - index = paddle.to_tensor(data_index) + x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0]]) + index = paddle.to_tensor([0, 1, 1], dtype='int32') out_z1 = paddle.index_select(x=x, index=index) #[[1. 2. 3. 4.] # [5. 6. 7. 8.] @@ -363,48 +362,44 @@ def nonzero(input, as_tuple=False): Examples: .. code-block:: python import paddle - import paddle.fluid as fluid - import numpy as np - - data1 = np.array([[1.0, 0.0, 0.0], - [0.0, 2.0, 0.0], - [0.0, 0.0, 3.0]]) - data2 = np.array([0.0, 1.0, 0.0, 3.0]) - data3 = np.array([0.0, 0.0, 0.0]) - with fluid.dygraph.guard(): - x1 = fluid.dygraph.to_variable(data1) - x2 = fluid.dygraph.to_variable(data2) - x3 = fluid.dygraph.to_variable(data3) - out_z1 = paddle.nonzero(x1) - print(out_z1.numpy()) - #[[0 0] - # [1 1] - # [2 2]] - out_z1_tuple = paddle.nonzero(x1, as_tuple=True) - for out in out_z1_tuple: - print(out.numpy()) - #[[0] - # [1] - # [2]] - #[[0] - # [1] - # [2]] - out_z2 = paddle.nonzero(x2) - print(out_z2.numpy()) - #[[1] - # [3]] - out_z2_tuple = paddle.nonzero(x2, as_tuple=True) - for out in out_z2_tuple: - print(out.numpy()) - #[[1] - # [3]] - out_z3 = paddle.nonzero(x3) - print(out_z3.numpy()) - #[] - out_z3_tuple = paddle.nonzero(x3, as_tuple=True) - for out in out_z3_tuple: - print(out.numpy()) - #[] + + paddle.disable_static() + + x1 = paddle.to_tensor([[1.0, 0.0, 0.0], + [0.0, 2.0, 0.0], + [0.0, 0.0, 3.0]]) + x2 = paddle.to_tensor([0.0, 1.0, 0.0, 3.0]) + x3 = paddle.to_tensor([0.0, 0.0, 0.0]) + out_z1 = paddle.nonzero(x1) + print(out_z1.numpy()) + #[[0 0] + # [1 1] + # [2 2]] + out_z1_tuple = paddle.nonzero(x1, as_tuple=True) + for out in out_z1_tuple: + print(out.numpy()) + #[[0] + # [1] + # [2]] + #[[0] + # [1] + # [2]] + out_z2 = paddle.nonzero(x2) + print(out_z2.numpy()) + #[[1] + # [3]] + out_z2_tuple = paddle.nonzero(x2, as_tuple=True) + for out in out_z2_tuple: + print(out.numpy()) + #[[1] + # [3]] + out_z3 = paddle.nonzero(x3) + print(out_z3.numpy()) + #[] + out_z3_tuple = paddle.nonzero(x3, as_tuple=True) + for out in out_z3_tuple: + print(out.numpy()) + #[] """ list_out = [] shape = input.shape @@ -451,16 +446,15 @@ def sort(x, axis=-1, descending=False, name=None): Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - input_array = np.array([[[5,8,9,5], - [0,0,1,7], - [6,9,2,4]], - [[5,2,4,2], - [4,7,7,9], - [1,7,0,6]]]).astype(np.float32) - x = paddle.to_variable(input_array) + x = paddle.to_tensor([[[5,8,9,5], + [0,0,1,7], + [6,9,2,4]], + [[5,2,4,2], + [4,7,7,9], + [1,7,0,6]]], + dtype='float32') out1 = paddle.sort(x=x, axis=-1) out2 = paddle.sort(x=x, axis=0) out3 = paddle.sort(x=x, axis=1) @@ -536,16 +530,11 @@ def where(condition, x, y, name=None): .. 
code-block:: python import paddle - import numpy as np - import paddle.fluid as fluid - x_i = np.array([0.9383, 0.1983, 3.2, 1.2]).astype("float32") - y_i = np.array([1.0, 1.0, 1.0, 1.0]).astype("float32") - - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(x_i) - y = fluid.dygraph.to_variable(y_i) - out = paddle.where(x>1, x, y) + paddle.disable_static() + x = paddle.to_tensor([0.9383, 0.1983, 3.2, 1.2]) + y = paddle.to_tensor([1.0, 1.0, 1.0, 1.0]) + out = paddle.where(x>1, x, y) print(out.numpy()) #out: [1.0, 1.0, 3.2, 1.2] @@ -622,50 +611,41 @@ def index_sample(x, index): .. code-block:: python import paddle - import paddle.fluid as fluid - import numpy as np - - data = np.array([[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [9.0, 10.0, 11.0, 12.0]]).astype('float32') - - data_index = np.array([[0, 1, 2], - [1, 2, 3], - [0, 0, 0]]).astype('int32') - - target_data = np.array([[100, 200, 300, 400], - [500, 600, 700, 800], - [900, 1000, 1100, 1200]]).astype('int32') - - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(data) - index = fluid.dygraph.to_variable(data_index) - target = fluid.dygraph.to_variable(target_data) - - out_z1 = paddle.index_sample(x, index) - print(out_z1.numpy()) - #[[1. 2. 3.] - # [6. 7. 8.] - # [9. 9. 9.]] - - # Use the index of the maximum value by topk op - # get the value of the element of the corresponding index in other tensors - top_value, top_index = fluid.layers.topk(x, k=2) - out_z2 = paddle.index_sample(target, top_index) - print(top_value.numpy()) - #[[ 4. 3.] - # [ 8. 7.] - # [12. 11.]] - - print(top_index.numpy()) - #[[3 2] - # [3 2] - # [3 2]] - - print(out_z2.numpy()) - #[[ 400 300] - # [ 800 700] - # [1200 1100]] + + paddle.disable_static() + x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0]], dtype='float32') + index = paddle.to_tensor([[0, 1, 2], + [1, 2, 3], + [0, 0, 0]], dtype='int32') + target = paddle.to_tensor([[100, 200, 300, 400], + [500, 600, 700, 800], + [900, 1000, 1100, 1200]], dtype='int32') + out_z1 = paddle.index_sample(x, index) + print(out_z1.numpy()) + #[[1. 2. 3.] + # [6. 7. 8.] + # [9. 9. 9.]] + + # Use the index of the maximum value by topk op + # get the value of the element of the corresponding index in other tensors + top_value, top_index = paddle.topk(x, k=2) + out_z2 = paddle.index_sample(target, top_index) + print(top_value.numpy()) + #[[ 4. 3.] + # [ 8. 7.] + # [12. 11.]] + + print(top_index.numpy()) + #[[3 2] + # [3 2] + # [3 2]] + + print(out_z2.numpy()) + #[[ 400 300] + # [ 800 700] + # [1200 1100]] """ @@ -698,27 +678,20 @@ def masked_select(x, mask, name=None): Returns: A 1-D Tensor which is the same data type as ``x``. - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of float32, float64, int32 and int64. - TypeError: ``mask`` must be a Tensor and the data type of ``mask`` must be bool. - Examples: .. 
code-block:: python import paddle - import numpy as np - + paddle.disable_static() - data = np.array([[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [9.0, 10.0, 11.0, 12.0]]).astype('float32') - - mask_data = np.array([[True, False, False, False], - [True, True, False, False], - [True, False, False, False]]).astype('bool') - x = paddle.to_tensor(data) - mask = paddle.to_tensor(mask_data) + + x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0]]) + mask = paddle.to_tensor([[True, False, False, False], + [True, True, False, False], + [True, False, False, False]]) out = paddle.masked_select(x, mask) #[1.0 5.0 6.0 9.0] """ @@ -763,20 +736,17 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None): .. code-block:: python - import numpy as np import paddle paddle.disable_static() - data_1 = np.array([1, 4, 5, 7]) - tensor_1 = paddle.to_tensor(data_1) + tensor_1 = paddle.to_tensor([1, 4, 5, 7]) value_1, indices_1 = paddle.topk(tensor_1, k=1) print(value_1.numpy()) # [7] print(indices_1.numpy()) # [3] - data_2 = np.array([[1, 4, 5, 7], [2, 6, 2, 5]]) - tensor_2 = paddle.to_tensor(data_2) + tensor_2 = paddle.to_tensor([[1, 4, 5, 7], [2, 6, 2, 5]]) value_2, indices_2 = paddle.topk(tensor_2, k=1) print(value_2.numpy()) # [[7] diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 91676a6316b81a1998b9b48fb9ea7fcba6d67c25..d56dff5a81018e13e1c186f66172f868b0c4074b 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -237,10 +237,6 @@ def numel(x, name=None): Returns: Tensor: The number of elements for the input Tensor. - - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of bool, float16, float32, float64, int32, int64. - Examples: .. code-block:: python diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index e078595dc9551763f2c4fc1b17f5b4220e3b1f6d..b7b5d44650f8d62926241a57feedfd5b932a37f5 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -499,6 +499,30 @@ class TestModelFunction(unittest.TestCase): self.assertTrue(params[0].shape[1] == 10) fluid.disable_dygraph() if dynamic else None + def test_summary(self): + def _get_param_from_state_dict(state_dict): + params = 0 + for k, v in state_dict.items(): + params += np.prod(v.numpy().shape) + return params + + for dynamic in [True, False]: + device = paddle.set_device('cpu') + fluid.enable_dygraph(device) if dynamic else None + net = MyModel() + inputs = [InputSpec([None, 20], 'float32', 'x')] + model = Model(net, inputs) + model.prepare() + params_info = model.summary() + gt_params = _get_param_from_state_dict(net.state_dict()) + + np.testing.assert_allclose(params_info['total_params'], gt_params) + print(params_info) + + model.summary(input_size=(20)) + model.summary(input_size=[(20)]) + model.summary(input_size=(20), batch_size=2) + def test_export_deploy_model(self): for dynamic in [True, False]: fluid.enable_dygraph() if dynamic else None diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index f6299980b3e5c0bd0c7551b6b51c9b067d7960b5..2a649c776b4103b1d3d8648957bbff7a32007410 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py
index f6299980b3e5c0bd0c7551b6b51c9b067d7960b5..2a649c776b4103b1d3d8648957bbff7a32007410 100644
--- a/python/paddle/utils/__init__.py
+++ b/python/paddle/utils/__init__.py
@@ -12,14 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from .plot import Ploter
 from .profiler import ProfilerOptions
 from .profiler import Profiler
 from .profiler import get_profiler
 from .deprecated import deprecated
+
 from . import download

-__all__ = ['dump_config', 'Ploter', 'deprecated', 'download']
+__all__ = ['dump_config', 'deprecated', 'download']

 #TODO: define new api under this directory
 # __all__ = ['unique_name',
diff --git a/python/paddle/utils/plot.py b/python/paddle/utils/plot.py
deleted file mode 100644
index ee651f2f0cd6f2e594a4e74c896baa924f70bbf5..0000000000000000000000000000000000000000
--- a/python/paddle/utils/plot.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import six
-
-
-class PlotData(object):
-    def __init__(self):
-        self.step = []
-        self.value = []
-
-    def append(self, step, value):
-        self.step.append(step)
-        self.value.append(value)
-
-    def reset(self):
-        self.step = []
-        self.value = []
-
-
-class Ploter(object):
-    """
-    Plot input data in a 2D graph
-
-    Args:
-        title: assign the title of input data.
-        step: x_axis of the data.
-        value: y_axis of the data.
-    """
-
-    def __init__(self, *args):
-        self.__args__ = args
-        self.__plot_data__ = {}
-        for title in args:
-            self.__plot_data__[title] = PlotData()
-        # demo in notebooks will use Ploter to plot figure, but when we convert
-        # the ipydb to py file for testing, the import of matplotlib will make the
-        # script crash. So we can use `export DISABLE_PLOT=True` to disable import
-        # these libs
-        self.__disable_plot__ = os.environ.get("DISABLE_PLOT")
-        if not self.__plot_is_disabled__():
-            import matplotlib.pyplot as plt
-            from IPython import display
-            self.plt = plt
-            self.display = display
-
-    def __plot_is_disabled__(self):
-        return self.__disable_plot__ == "True"
-
-    def append(self, title, step, value):
-        """
-        Feed data
-
-        Args:
-            title: assign the group data to this subtitle.
-            step: the x_axis of data.
-            value: the y_axis of data.
-
-        Examples:
-            .. code-block:: python
-                plot_curve = Ploter("Curve 1","Curve 2")
-                plot_curve.append(title="Curve 1",step=1,value=1)
-        """
-        assert isinstance(title, six.string_types)
-        assert title in self.__plot_data__
-        data = self.__plot_data__[title]
-        assert isinstance(data, PlotData)
-        data.append(step, value)
-
-    def plot(self, path=None):
-        """
-        Plot data in a 2D graph
-
-        Args:
-            path: store the figure to this file path. Defaul None.
-
-        Examples:
-            .. code-block:: python
-                plot_curve = Ploter()
-                plot_cure.plot()
-        """
-        if self.__plot_is_disabled__():
-            return
-
-        titles = []
-        for title in self.__args__:
-            data = self.__plot_data__[title]
-            assert isinstance(data, PlotData)
-            if len(data.step) > 0:
-                titles.append(title)
-                self.plt.plot(data.step, data.value)
-        self.plt.legend(titles, loc='upper left')
-        if path is None:
-            self.display.clear_output(wait=True)
-            self.display.display(self.plt.gcf())
-        else:
-            self.plt.savefig(path)
-        self.plt.gcf().clear()
-
-    def reset(self):
-        for key in self.__plot_data__:
-            data = self.__plot_data__[key]
-            assert isinstance(data, PlotData)
-            data.reset()
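With ``Ploter`` removed from ``paddle.utils``, notebooks that used it for live cost curves can call matplotlib directly. A rough drop-in sketch of the ``append``/``plot`` workflow the deleted class provided (matplotlib is assumed to be installed, as it was for ``Ploter`` itself):

.. code-block:: python

    import matplotlib.pyplot as plt

    class LossCurve(object):
        """Tiny stand-in for the removed paddle.utils.Ploter."""

        def __init__(self, *titles):
            # one (steps, values) series per curve title
            self.data = {t: ([], []) for t in titles}

        def append(self, title, step, value):
            steps, values = self.data[title]
            steps.append(step)
            values.append(value)

        def plot(self, path=None):
            for title, (steps, values) in self.data.items():
                plt.plot(steps, values, label=title)
            plt.legend(loc='upper left')
            plt.savefig(path) if path else plt.show()

    curve = LossCurve("train cost")
    curve.append("train cost", step=1, value=0.9)
    curve.append("train cost", step=2, value=0.5)
    curve.plot()  # or curve.plot("cost.png")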
diff --git a/python/setup.py.in b/python/setup.py.in
index 64ac2b9b9a4d210c59193e117c6000986bfb07a0..773166400347ab550f82e4fabcb0d89b90818fc2 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -156,6 +156,7 @@ packages=['paddle',
           'paddle.framework',
           'paddle.jit',
           'paddle.fluid',
+          'paddle.fluid.inference',
           'paddle.fluid.dygraph',
           'paddle.fluid.dygraph.dygraph_to_static',
           'paddle.fluid.dygraph.amp',
diff --git a/tools/get_cpu_info.sh b/tools/get_cpu_info.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a1881f551da1ca022c186c50c667e51dff89f9be
--- /dev/null
+++ b/tools/get_cpu_info.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+if [ "`uname -s`" != "Linux" ]; then
+  echo "This script currently supports Linux only!"
+  exit 0
+fi
+
+echo "********** Hardware Information **********"
+sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l`
+cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l`
+ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs`
+physical_cores=$((sockets * cores_per_socket))
+virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
+numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs`
+echo "CPU Name               : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`"
+echo "CPU Family             : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`"
+echo "Socket Number          : $sockets"
+echo "Cores Per Socket       : $cores_per_socket"
+echo "Total Physical Cores   : $physical_cores"
+echo "Total Virtual Cores    : $virtual_cores"
+if [ $ht -eq 1 ]; then
+  echo "Hyper Threading        : OFF"
+  if [ $physical_cores -ne $virtual_cores ]; then
+    echo "Error: HT logical error"
+  fi
+else
+  echo "Hyper Threading        : ON"
+  if [ $physical_cores -ge $virtual_cores ]; then
+    echo "Error: HT logical error"
+  fi
+fi
+echo "NUMA Nodes             : $numa_nodes"
+if [ $numa_nodes -lt $sockets ]; then
+  echo "Warning: NUMA nodes are fewer than sockets;\
+ at least $sockets are expected for the best performance"
+fi
+
+echo "********** Software Information **********"
+echo "OS Version             : `cat /proc/version`"
+echo "Kernel Release Version : `uname -r`"
+echo "Kernel Patch Version   : `uname -v`"
+echo "GCC Version            :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`"
+if command -v cmake >/dev/null 2>&1; then
+  cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'`
+else
+  cmake_ver=" Not installed"
+fi
+echo "CMake Version          :$cmake_ver"
+echo "******************************************"
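The socket/core arithmetic in ``tools/get_cpu_info.sh`` can be cross-checked from Python using the same ``/proc/cpuinfo`` fields the shell pipelines read. A sketch of that cross-check (Linux only; like the script, it assumes every socket exposes the same set of core ids):

.. code-block:: python

    def cpu_topology(path='/proc/cpuinfo'):
        # Count unique socket ids, unique core ids, and processor entries,
        # mirroring the grep | sort -u | wc -l pipelines in the script.
        sockets, core_ids, processors = set(), set(), 0
        with open(path) as f:
            for line in f:
                key, _, value = line.partition(':')
                key, value = key.strip(), value.strip()
                if key == 'physical id':
                    sockets.add(value)
                elif key == 'core id':
                    core_ids.add(value)
                elif key == 'processor':
                    processors += 1
        physical = len(sockets) * len(core_ids)  # same formula as the script
        return {'sockets': len(sockets),
                'cores_per_socket': len(core_ids),
                'physical_cores': physical,
                'virtual_cores': processors,
                'hyper_threading': processors > physical}

    print(cpu_topology())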