diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 04befbe1ca01d4bfec5872a63565f21d110a6c67..efe6fa1b2daffcbbaa2b7945e7139ac83f689bcd 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -441,6 +441,7 @@ class SectionWorker : public DeviceWorker { skip_vars_ = skip_vars; } static void ResetBatchId() { batch_id_ = 0; } + static void ResetThreadCompletedFlag() { threads_completed = false; } static std::atomic<int> cpu_id_; diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index edd1700ae7284c77883af6abd2cd7d511097685f..df482f43346c57cc59af42936b6a7308b76cbd3a 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -41,6 +41,11 @@ message LocalSGDConfig { optional int32 begin_step = 2 [ default = 1 ]; } +message AdaptiveLocalSGDConfig { + optional int32 init_k_steps = 1 [ default = 1 ]; + optional int32 begin_step = 2 [ default = 1 ]; +} + message GradientMergeConfig { optional int32 k_steps = 1 [ default = 1 ]; optional bool avg = 2 [ default = true ]; @@ -121,6 +126,7 @@ message DistributedStrategy { optional bool cudnn_exhaustive_search = 21 [ default = true ]; optional int32 conv_workspace_size_limit = 22 [ default = 4000 ]; optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ]; + optional bool adaptive_localsgd = 24 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; @@ -131,6 +137,7 @@ message DistributedStrategy { optional AsyncConfig a_sync_configs = 107; optional LarsConfig lars_configs = 108; optional LambConfig lamb_configs = 109; + optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110; optional BuildStrategy build_strategy = 201; optional ExecutionStrategy execution_strategy = 202; } diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 7612df9ab915011001b57b37e4fd559d393302a7..3f88a460d140f6d7389194a29e37128f5ba5b458 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -19,6 +19,7 @@ #include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -334,3 +335,8 @@ void EmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const { REGISTER_PASS(embedding_eltwise_layernorm_fuse_pass, paddle::framework::ir::EmbeddingEltwiseLayerNormFusePass); +REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("lookup_table", 0) + .EQ("elementwise_add", 0)); diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc index 71c9dbae1a46af1ecae0aaff3fde52de8142d4bb..727e42629f9fab9183668ae0cc84ae54eb01982c 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc @@ -16,12 +16,13 @@ limitations under the License.
*/ #include #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { namespace ir { -TEST(SkipLayerNormFusePass, basic) { +TEST(EmbeddingElewiseLayernormFusePass, basic) { // inputs operator output // -------------------------------------------------------------------- // (x, y) elementwise_add -> elementwise_out @@ -91,6 +92,12 @@ TEST(SkipLayerNormFusePass, basic) { "The number of fusion nodes does not meet expectations after fuse")); } +TEST(EmbeddingElewiseLayernormFusePass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("embedding_eltwise_layernorm_fuse_pass")); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index 82e0af3c198750296032769f2f3b04658871adb7..f7a8e3e3f6c3c77e978c57eeb7515d8cfce86471 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -17,6 +17,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -84,6 +85,19 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "do not perform " + type() + "+bias fuse"; return; } + if (conv->Op()->HasAttr("dilations")) { + auto dilations = + BOOST_GET_CONST(std::vector<int>, conv->Op()->GetAttr("dilations")); + for (const auto& d : dilations) { + if (d != 1) { + LOG(WARNING) + << "dilation conv is not supported by MKLDNN, the fuse is not " + << "applied and conv attribute use_mkldnn is set to false"; + conv->Op()->SetAttr("use_mkldnn", false); + return; + } + } + } auto* eltwise_bias_tensor = scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>(); @@ -151,3 +165,8 @@ REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass, paddle::framework::ir::Conv2DTransposeBiasFusePass); REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass, paddle::framework::ir::Conv3DBiasFusePass); +REGISTER_PASS_CAPABILITY(conv_bias_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("elementwise_add", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc index 88aac001a93ae836d62fe3bf3fc502960eebe70f..455350d2f703c52a9ef3e5714a60573408310080 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/imperative/type_defs.h" namespace paddle { @@ -149,6 +150,12 @@ TEST(ConvBiasFusePass, conv2d_transpose) { ASSERT_EQ(pass.type(), std::string("conv2d_transpose")); } +TEST(ConvBiasFusePass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("conv_bias_mkldnn_fuse_pass")); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index af2b1308e084ee937f26bf90caf2df6fb44e044b..2fb131aceaad28a365e8202dca35cfe53f8f54da 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -19,6 +19,7 @@ #include #include #include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -341,3 +342,8 @@ void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass, paddle::framework::ir::ResidualConnectionMKLDNNFusePass); +REGISTER_PASS_CAPABILITY(conv_elementwise_add_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("elementwise_add", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc index 8a13596cd50087475bf12b6cfa5920b82e24de31..fd4910fc8e95cd98fe9feaba51e70d5a143ad443 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -267,6 +268,12 @@ TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) { AssertOpsCount(graph, 2, 1); } +TEST(ConvElementwiseAddMKLDNNFusePass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("conv_elementwise_add_mkldnn_fuse_pass")); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc index c5965701a53d4312d89f1e09f17840b09f1bd5f5..df5ba3314e637fefe930d4c45f431314dd7d8493 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -57,3 +58,7 @@ void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(depthwise_conv_mkldnn_pass, paddle::framework::ir::DepthwiseConvMKLDNNPass); +REGISTER_PASS_CAPABILITY(depthwise_conv_mkldnn_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().EQ( + "depthwise_conv2d", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc index a37565236cd440bd803184d038ad4deb3c0b6150..c6c72ba33d6295d90c502ab88d7d712d76a11aad 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc @@ -16,6 +16,8 @@ #include +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace framework { namespace ir { @@ -70,6 +72,12 @@ ProgramDesc BuildProgramDesc() { return prog; } +TEST(DepthwiseConvMKLDNNPass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("depthwise_conv_mkldnn_pass")); +} + TEST(DepthwiseConvMKLDNNPass, basic) { auto prog = BuildProgramDesc(); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 198107ea082dc86d9e65a926bf9befe2fc4abfa4..9d2b4ebaf8ccf33e175e46c08657e7eeed467055 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -19,6 +19,7 @@ #include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/errors.h" namespace paddle { @@ -707,3 +708,13 @@ REGISTER_PASS(multihead_matmul_fuse_pass, REGISTER_PASS(multihead_matmul_fuse_pass_v2, paddle::framework::ir::MultiHeadMatmulV2FusePass); +REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("mul", 0) + .EQ("elementwise_add", 0) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .EQ("matmul", 0) + .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc index d8a06b037bdefbe8776c9b95b36be80afb988393..2eda643d4e53aa061908f02c9d31b765241c318b 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc @@ -12,6 +12,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h" // NOLINT #include #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -133,6 +134,12 @@ TEST(MultiHeadMatmulFusePass, basic) { num_fused_nodes_after)); } +TEST(MultiHeadMatmulFusePass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("multihead_matmul_fuse_pass_v2")); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index 9dddc9154f8fc39144b38535824999b933a92106..2e3cd16d5ce49fdd6186f98c72d77c75c4053559 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -180,3 +181,8 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { REGISTER_PASS(skip_layernorm_fuse_pass, paddle::framework::ir::SkipLayerNormFusePass); +REGISTER_PASS_CAPABILITY(skip_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("elementwise_add", 0) + .EQ("layer_norm", 0)); diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc index d2d7469872857a070294520a589fee4ca383f065..eff5dcddf54ee49be5b14a7bdfa609079f925036 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -54,6 +55,12 @@ TEST(SkipLayerNormFusePass, basic) { "The number of fusion nodes does not meet expectations after fuse")); } +TEST(SkipLayerNormFusePass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("skip_layernorm_fuse_pass")); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 758b728fd9cffff6867a46a6c22c86e496103b84..d7506edbf4ca74976f067d879dad7df4923f946f 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -251,6 +251,7 @@ void PipelineTrainer::Finalize() { } root_scope_->DropKids(); SectionWorker::ResetBatchId(); + SectionWorker::ResetThreadCompletedFlag(); } Scope* PipelineTrainer::GetWorkerScope(int thread_id) { diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 03b7afbb8771fadbe07a352497fa69a299928cf7..b9a3cac0ec4c9863f4e0dea102965eff8c47fe8d 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -196,7 +196,6 @@ void SectionWorker::TrainFiles() { if (threads_completed) { VLOG(3) << "thread " << thread_id_ << " completed."; lk.unlock(); - threads_completed = false; return; } lk.unlock(); @@ -459,7 +458,6 @@ void SectionWorker::TrainFilesWithProfiler() { << ", mean_time: " << op_total_time[i] / op_count[i]; } VLOG(0) << "================================"; - threads_completed = false; return; } lk.unlock(); diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index a1b43de469542a1612600a5a22c19c8745179afb..146d5932577fb7f4e2e33f6d98c51756ffd02073 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -9,7 +9,8 @@ if(WITH_GPU AND TENSORRT_FOUND) endif() function(download_data install_dir data_file) - if (NOT EXISTS ${install_dir}/${data_file}) + string(REGEX MATCH "[^/\\]+$" file_name ${data_file}) + if (NOT EXISTS ${install_dir}/${file_name}) inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file}) endif() endfunction() diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h index 3958d3f685470f2505abf0e8bfd269d3834970ae..338e46111fca83230aca1c7877578e557cef5a31 100644 --- a/paddle/fluid/operators/average_accumulates_op.h +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -54,9 +54,13 @@ class AverageAccumulatesKernel : public framework::OpKernel { float average_window = ctx.Attr<float>("average_window"); int64_t max_average_window = ctx.Attr<int64_t>("max_average_window"); int64_t min_average_window = ctx.Attr<int64_t>("min_average_window"); - PADDLE_ENFORCE_LE(min_average_window, max_average_window, - "min_average_window shouldn't be larger than " - "max_average_window"); + PADDLE_ENFORCE_LE( + min_average_window, max_average_window, + platform::errors::InvalidArgument( + "Expected min_average_window <= max_average_window, but received " + "min_average_window %ld and max_average_window %ld.", + min_average_window, max_average_window)); // Get inputs auto* param = ctx.Input<Tensor>("param"); diff --git a/paddle/fluid/operators/distributed_ops/fake_init_op.cc
b/paddle/fluid/operators/distributed_ops/fake_init_op.cc index 1da164175e1daf8e872964d8fad21c4f3dd614e7..cb27dc75eb2faf15746e596265d1b1e4b3717e52 100644 --- a/paddle/fluid/operators/distributed_ops/fake_init_op.cc +++ b/paddle/fluid/operators/distributed_ops/fake_init_op.cc @@ -43,9 +43,9 @@ class FakeInitOp : public framework::OperatorBase { tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value(); tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape"))); } else { - PADDLE_THROW( + PADDLE_THROW(platform::errors::InvalidArgument( "fake init op's output only " - "supports SelectedRows and LoDTensor"); + "supports SelectedRows and LoDTensor")); } } }; diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 5e1e408eb2c28239fded0d0cf037c94783828b50..43de8488a0e4ac24f7e261671488cd88563a805d 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -134,7 +134,10 @@ void ListenAndServOp::RunSyncLoop( auto optimize_blocks = Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks); PADDLE_ENFORCE_GE(num_blocks, 2, - "server program should have at least 2 blocks"); + platform::errors::PreconditionNotMet( + "Invalid number of blocks in server program. Expected " + "equal or greater than 2. Received %zu", + num_blocks)); // Prepare all the server block std::vector<framework::BlockDesc *> optimize_blocks_list; @@ -218,7 +221,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, VLOG(3) << "reset sparse var: " << varname; var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear(); } else { - PADDLE_THROW("The type of sparse var should be SelectedRows"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "The type of sparse var should be SelectedRows")); } } if (UNLIKELY(reset_all)) { @@ -235,7 +239,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, math::set_constant(*dev_ctx, var->GetMutable<framework::LoDTensor>(), static_cast<float>(0)); } else { - PADDLE_THROW("The type of dense var should be in [LoDTensor, Tensor]"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "The type of dense var should be in [LoDTensor, Tensor]")); } } } @@ -254,8 +259,15 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, std::vector<std::string> pieces; split(grad_and_id, ':', &pieces); VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1]; - PADDLE_ENFORCE_EQ(pieces.size(), 2); - PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0); + PADDLE_ENFORCE_EQ(pieces.size(), 2, + platform::errors::PreconditionNotMet( + "Invalid format of grad_and_id argument. " + "Expected \"grad:block_id\". Received %s", + grad_and_id.c_str())); + PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0, + platform::errors::AlreadyExists( + "The gradient name %s already exists in out_map", + pieces[0].c_str())); int block_id = std::stoi(pieces[1]); (*out_map)[pieces[0]] = block_id; @@ -267,7 +279,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, size_t num_blocks = program->Size(); PADDLE_ENFORCE_GE(num_blocks, 2, - "server program should have at least 2 blocks"); + platform::errors::PreconditionNotMet( + "Invalid number of blocks in server program. Expected " + "equal or greater than 2.
Recieved %zu", + num_blocks)); std::vector block_list; for (size_t blkid = 1; blkid < num_blocks; ++blkid) { block_list.push_back(blkid); @@ -342,9 +357,9 @@ void ListenAndServOp::CacheVarsType(const std::vector &varnames, var->IsType()) { dense_vars_.push_back(varname); } else { - PADDLE_THROW( + PADDLE_THROW(platform::errors::PreconditionNotMet( "The type of received var should be in [SelectedRows, LoDTensor, " - "Tensor]."); + "Tensor].")); } } } @@ -450,7 +465,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, split(prefetch_var_name_and_id, ':', &pieces); VLOG(3) << "after split, prefetch_var = " << pieces[0] << ", id=" << pieces[1]; - PADDLE_ENFORCE_EQ(pieces.size(), 2); + PADDLE_ENFORCE_EQ( + pieces.size(), 2, + platform::errors::PreconditionNotMet( + "Invalid format of prefetch_var_name_and_id argument. " + "Expected \"xxx:xxx\". Recieved %s", + prefetch_var_name_and_id.c_str())); int block_id = std::stoi(pieces[1]); prefetch_block_id_list.push_back(block_id); @@ -476,7 +496,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, sparse_grad_name_to_param_name_str) { std::vector pieces; split(sparse_grad_name_and_param_name, ':', &pieces); - PADDLE_ENFORCE_EQ(pieces.size(), 2); + PADDLE_ENFORCE_EQ( + pieces.size(), 2, + platform::errors::PreconditionNotMet( + "Invalid format of sparse_grad_name_and_param_name argument. " + "Expected \"xxx:xxx\". Recieved %s", + sparse_grad_name_and_param_name.c_str())); VLOG(3) << "after split, sparse_grad_name = " << pieces[0] << ", param_name = " << pieces[1]; sparse_grad_name_to_param_name[pieces[0]] = pieces[1]; diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h index 5dc93740949e6e7c25be564927c8fcffde1a18d6..721c23e38307fddb63d41baddc395afa94f40336 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h @@ -61,8 +61,15 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, framework::Tensor *z) { int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, FloorDivFunctor(), z); + auto x_dims = x->dims(); + auto y_dims = y->dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, FloorDivFunctor(), z); + } else { + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, InverseFloorDivFunctor(), z); + } } template diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index f539e2e6f6d2d6faa084d1e62ec894b4b65e96bf..3d28ca90a5a15fd53a57034a4722a21842dc4b1c 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -55,31 +55,38 @@ class EmptyOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty"); if (context->HasInput("ShapeTensor")) { - auto dims = context->GetInputDim("ShapeTensor"); + auto shape_dims = context->GetInputDim("ShapeTensor"); int num_ele = 1; - for (int i = 0; i < dims.size(); ++i) { - num_ele *= dims[i]; + for (int i = 0; i < shape_dims.size(); ++i) { + num_ele *= shape_dims[i]; } - - context->SetOutputDim("Out", framework::make_ddim({num_ele})); + auto vec_dims = std::vector(num_ele, -1); + context->SetOutputDim("Out", framework::make_ddim(vec_dims)); } else if (context->HasInputs("ShapeTensorList")) { std::vector out_dims; auto dims_list = 
context->GetInputsDim("ShapeTensorList"); for (size_t i = 0; i < dims_list.size(); ++i) { auto& dims = dims_list[i]; - PADDLE_ENFORCE_EQ( - dims, framework::make_ddim({1}), - "ShapeError: The shape of Tensor in list must be [1]. " - "But received the shape " - "is [%s]", - dims); - - out_dims.push_back(dims[0]); + PADDLE_ENFORCE_EQ(dims, framework::make_ddim({1}), + platform::errors::InvalidArgument( + "The shape of Tensor in list must be [1]. " + "But received the shape is [%s]", + dims)); + + out_dims.push_back(-1); } context->SetOutputDim("Out", framework::make_ddim(out_dims)); } else { auto& shape = context->Attrs().Get>("shape"); + for (size_t i = 0; i < shape.size(); ++i) { + PADDLE_ENFORCE_GE( + shape[i], 0, + platform::errors::InvalidArgument( + "Each value of attribute 'shape' is expected to be no less " + "than 0. But recieved: shape[%u] = %d; shape = [%s].", + i, shape[i], framework::make_ddim(shape))); + } context->SetOutputDim("Out", framework::make_ddim(shape)); } } diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index 0155ef188ef967fbf67505d28beeeaf956bb3a70..550de1aadde2935fae34226dba78cc06d82cd1f3 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -87,7 +87,10 @@ class BeamSearchFunctor { lod[0].assign(high_level.begin(), high_level.end()); lod[1].assign(low_level.begin(), low_level.end()); if (!framework::CheckLoD(lod)) { - PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + PADDLE_THROW(platform::errors::InvalidArgument( + "lod %s is not right in" + " beam_search, please check your code.", + framework::LoDToString(lod))); } selected_ids->set_lod(lod); selected_scores->set_lod(lod); diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index cf6d44c1abc531da9d00738bba22f70a4c68bbab..ed3ead47d171efb4128a294c7d7a24324c7187b7 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -400,7 +400,10 @@ class BeamSearchFunctor { context.Wait(); if (!framework::CheckLoD(selected_lod)) { - PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod)); + PADDLE_THROW(platform::errors::InvalidArgument( + "lod %s is not right in" + " beam_search, please check your code.", + framework::LoDToString(selected_lod))); } selected_ids->set_lod(selected_lod); diff --git a/paddle/fluid/operators/math/blas.cc b/paddle/fluid/operators/math/blas.cc index 6a143b3c056455595fdedc131b0c5f4ee756e1e0..2a7ce83967f0f74f4c2178dd4277e6a1687b5ec7 100644 --- a/paddle/fluid/operators/math/blas.cc +++ b/paddle/fluid/operators/math/blas.cc @@ -20,7 +20,11 @@ namespace operators { namespace math { MatDescriptor CreateMatrixDescriptor(const framework::DDim &tensor_dim, int num_flatten_cols, bool trans) { - PADDLE_ENFORCE_GT(tensor_dim.size(), 1); + PADDLE_ENFORCE_GT( + tensor_dim.size(), 1, + platform::errors::InvalidArgument("The tensor dim size should be greater " + "than 1, but reveived dim size is %d", + tensor_dim.size())); MatDescriptor retv; if (num_flatten_cols > 1) { auto flatten_dim = framework::flatten_to_2d(tensor_dim, num_flatten_cols); diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index d0c5f74d4efb8248b41d8b2af285e8dd7ec4d479..a0464cf70e2dcc44c42fc2ca7440680ef8a53e6e 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -60,7 +60,8 @@ struct CUBlas { 
PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cublasSgemmStridedBatched(args...)); #else - PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5"); + PADDLE_THROW(platform::errors::Unimplemented( + "SgemmStridedBatched is not supported on cuda <= 7.5")); #endif } @@ -85,7 +86,8 @@ struct CUBlas { beta, C, Ctype, ldc)); }); #else - PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); + PADDLE_THROW(platform::errors::Unimplemented( + "cublasSgemmEx is not supported on cuda <= 7.5")); #endif } @@ -146,13 +148,15 @@ struct CUBlas { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cublasDgemmStridedBatched(args...)); #else - PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5"); + PADDLE_THROW(platform::errors::Unimplemented( + "DgemmStridedBatched is not supported on cuda <= 7.5")); #endif } template <typename... ARGS> static void GEMM_EX(ARGS... args) { - PADDLE_THROW("Currently there are not cublasDgemmEx."); + PADDLE_THROW(platform::errors::Unimplemented( + "Currently there is no cublasDgemmEx.")); } template @@ -216,7 +220,8 @@ struct CUBlas { reinterpret_cast<const __half *>(beta), reinterpret_cast<__half *>(C), ldc, strideC, batchCount)); #else - PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5"); + PADDLE_THROW(platform::errors::Unimplemented( + "HgemmStridedBatched is not supported on cuda <= 7.5")); #endif } @@ -247,7 +252,8 @@ struct CUBlas { beta, C, Ctype, ldc, computeType, algo)); }); #else - PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx is not supported on cuda <= 7.5")); #endif } }; @@ -302,8 +308,12 @@ inline void Blas::GEMM( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53, - "cublas fp16 gemm requires GPU compute capability >= 53"); + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 53, " + "but received %d", + context_.GetComputeCapability())); float h_alpha = static_cast<float>(alpha); float h_beta = static_cast<float>(beta); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 892bf15738141bfbb7e75fa6b37c0cda53a8e098..515d6a2435e86fe07ffe1309628ef2fbeefdc6f0 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -29,7 +29,8 @@ template <> struct CBlas<int8_t> { template <typename... ARGS> static void VCOPY(ARGS... args) { - PADDLE_THROW("Blas VCOPY don't support int8_t"); + PADDLE_THROW(platform::errors::Unimplemented( + "Blas VCOPY is not supported on CPU, please check your code")); } }; @@ -347,22 +348,47 @@ struct CBlas { template <> struct CBlas<platform::float16> { - static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } + static void GEMM(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 GEMM not supported on CPU, please check your code")); + } + static void SMM_GEMM(...) { - PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); + PADDLE_THROW(platform::errors::Unimplemented( + "float16 SMM_GEMM not supported on CPU, please check your code")); } - static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } - static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } - static void VSQUARE(...) { - PADDLE_THROW("float16 VSQUARE not supported on CPU"); + static void VMUL(...)
{ + PADDLE_THROW(platform::errors::Unimplemented( + "float16 VMUL not supported on CPU, please check your code")); } - static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } - static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; - static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; - static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); }; + static void VEXP(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 VEXP not supported on CPU, please check your code")); + } + static void VSQUARE(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 VSQUARE not supported on CPU, please check your code")); + } + static void VPOW(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 VPOW not supported on CPU, please check your code")); + } + static void DOT(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 DOT not supported on CPU, please check your code")); + }; + static void SCAL(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 SCAL not supported on CPU, please check your code")); + }; + static void ASUM(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 ASUM not supported on CPU, please check your code")); + }; #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) { - PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); + PADDLE_THROW(platform::errors::Unimplemented( + "float16 GEMM_BATCH not supported on CPU, please check your code")); } #endif }; @@ -446,11 +472,18 @@ void Blas::MatMul(const framework::Tensor &mat_a, bool trans_a, auto dim_a = mat_a.dims(); auto dim_b = mat_b.dims(); auto dim_out = mat_out->dims(); - PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - "The input and output of matmul be matrix"); - PADDLE_ENFORCE( - mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(), - "The places of matrices must be same"); + PADDLE_ENFORCE_EQ( + dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, true, + platform::errors::InvalidArgument( + "The input and output of matmul should be matrix, the dim size must " + "be 2, " + "but received dim size input_a:%d, input_b:%d, output:%d", + dim_a.size(), dim_b.size(), dim_out.size())); + PADDLE_ENFORCE_EQ( + mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(), true, + platform::errors::InvalidArgument("The places of matrices in the matmul " + "should be same, please check your " + "code.")); int M = dim_out[0]; int N = dim_out[1]; @@ -715,7 +748,13 @@ void Blas::BatchedGEMMWithHead( } } else { - PADDLE_ENFORCE_EQ(W1, H2); + PADDLE_ENFORCE_EQ( + W1, H2, + platform::errors::InvalidArgument( + "The first matrix width should be the same as the second matrix " + "height, but received first matrix width %d" + ", second matrix height %d", + W1, H2)); int ldc = W2 * head_number; int sub_width = W1 / head_number; @@ -785,7 +824,14 @@ void Blas::MatMul(const framework::Tensor &mat_a, const framework::Tensor &mat_b, const MatDescriptor &dim_b, T alpha, framework::Tensor *mat_out, T beta) const { - PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_); + PADDLE_ENFORCE_EQ( + dim_a.width_, dim_b.height_, + platform::errors::InvalidArgument( + "The first matrix width should be the same as the second matrix " + "height, but received first matrix width %d" + ", second matrix height %d", + dim_a.width_, dim_b.height_)); + CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !dim_b.trans_ ?
CblasNoTrans : CblasTrans; if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { @@ -793,12 +839,14 @@ void Blas::MatMul(const framework::Tensor &mat_a, dim_a.width_, alpha, mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>()); } else { - PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || - dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0, - "dim_a.batch_size should be equal to dim_b.batch_size, or " - "one of dim_a.batch_size and dim_b.batch_size should be 0. " - "But got dim_a.batch_size = %d, dim_b.batch_size = %d.", - dim_a.batch_size_, dim_b.batch_size_); + PADDLE_ENFORCE_EQ( + dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || + dim_b.batch_size_ == 0, + true, platform::errors::InvalidArgument( + "dim_a.batch_size should be equal to dim_b.batch_size, or " + "one of dim_a.batch_size and dim_b.batch_size should be 0. " + "But got dim_a.batch_size = %d, dim_b.batch_size = %d.", + dim_a.batch_size_, dim_b.batch_size_)); this->template BatchedGEMM<T>( transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha, mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(), @@ -834,15 +882,42 @@ void Blas::MatMulWithHead(const framework::Tensor &mat_a, int head_number, framework::Tensor *mat_out, T beta, bool mat_b_split_vertical) const { - PADDLE_ENFORCE_EQ(dim_a.width_ % head_number, 0); - PADDLE_ENFORCE_GE(head_number, 1); - PADDLE_ENFORCE_LE(head_number, dim_a.width_); + PADDLE_ENFORCE_EQ( + dim_a.width_ % head_number, 0, + platform::errors::InvalidArgument( + "The first input width must be a multiple of the head number, " + "but received first input width %d" + ", head_number %d", + dim_a.width_, head_number)); + PADDLE_ENFORCE_GE(head_number, 1, + platform::errors::InvalidArgument( + "The head number should be greater than or equal to 1, " + "but received head number %d", + head_number)); + PADDLE_ENFORCE_LE( + head_number, dim_a.width_, + platform::errors::InvalidArgument( + "The head number should be less than or equal to the first input " + "width, but received first input width %d" + ", head_number %d", + dim_a.width_, head_number)); CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !dim_b.trans_ ?
CblasNoTrans : CblasTrans; if (mat_b_split_vertical) { - PADDLE_ENFORCE_EQ(dim_b.height_, dim_a.width_ / head_number); - PADDLE_ENFORCE_EQ(dim_b.width_ % head_number, 0); + PADDLE_ENFORCE_EQ( + dim_b.height_, dim_a.width_ / head_number, + platform::errors::InvalidArgument( + "The second input height should be equal to the first input " + "width divided by the head number, " + "but received second input height %d, first input width %d", + dim_b.height_, dim_a.width_ / head_number)); + PADDLE_ENFORCE_EQ( + dim_b.width_ % head_number, 0, + platform::errors::InvalidArgument( + "The second input width should be a multiple of the head number, " + "but received second input width %d" + ", head_number %d", + dim_b.width_, head_number)); } if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { @@ -888,9 +963,16 @@ void Blas::MatMulWithHead(const framework::Tensor &mat_a, mat_out->data<T>() + sub_matC_offset, ldc); } } else { - PADDLE_ENFORCE_EQ((dim_a.batch_size_ == dim_b.batch_size_ || - dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0), - true); + PADDLE_ENFORCE_EQ( + (dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || + dim_b.batch_size_ == 0), + true, + platform::errors::InvalidArgument( + "The first input batch size should be equal to the second input " + "batch size, or one of the two batch sizes should be 0. " + "But received first input batch size" + " %d, second input batch size %d", + dim_a.batch_size_, dim_b.batch_size_)); this->template BatchedGEMMWithHead<T>( transA, transB, dim_a.width_, dim_a.height_, dim_b.width_, diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index f44b33fcf2fc23f79483909046dd9e292fd8dde8..b8af5a21ca58185a10b34bca2310dd93436d340c 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -22,10 +22,12 @@ limitations under the License.
*/ #include #endif +#include #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/float16.h" +#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { namespace operators { @@ -63,6 +65,55 @@ DEFINE_CPU_TRANS(4); DEFINE_CPU_TRANS(5); DEFINE_CPU_TRANS(6); +template +struct TransposeNormal { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& in, framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = framework::stride(in.dims()); + auto out_stride = framework::stride(out->dims()); + const T* in_ptr = in.data(); + T* out_ptr = out->data(); + + auto transpose_helper = [&](int64_t beg, int64_t end) { + for (int64_t out_idx = beg; out_idx < end; ++out_idx) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + // calculate the input index + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride[i]; + tmp_idx -= coordinate * out_stride[i]; + in_idx += coordinate * in_stride[axis[i]]; + } + out_ptr[out_idx] = in_ptr[in_idx]; + } + }; + double cost_per_iteration = + rank * (Eigen::TensorOpCost::DivCost() + + 2 * Eigen::TensorOpCost::MulCost() + + 2 * Eigen::TensorOpCost::AddCost()); + Eigen::TensorOpCost cost(sizeof(T), sizeof(T), cost_per_iteration); + auto* cpu_device = context.eigen_pool_device(); + cpu_device->parallelFor(out->numel(), cost, std::move(transpose_helper)); + } +}; + +// define transpose normal +#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal + +DEFINE_CPU_TRANS_NORMAL(platform::float16); +DEFINE_CPU_TRANS_NORMAL(platform::bfloat16); +DEFINE_CPU_TRANS_NORMAL(float); +DEFINE_CPU_TRANS_NORMAL(double); +DEFINE_CPU_TRANS_NORMAL(int); +DEFINE_CPU_TRANS_NORMAL(int64_t); +DEFINE_CPU_TRANS_NORMAL(bool); +DEFINE_CPU_TRANS_NORMAL(int16_t); +DEFINE_CPU_TRANS_NORMAL(uint8_t); +DEFINE_CPU_TRANS_NORMAL(int8_t); + struct TensorSetConstantCPU { TensorSetConstantCPU(framework::Tensor* tensor, float value) : tensor_(tensor), value_(value) {} diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 1c519d226ebfe5ff19876f17b79fd36aa12c4130..4d7c1a49286dd67a314f81fd706bdcc5e340e450 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -11,8 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function_impl.h" @@ -23,6 +26,7 @@ namespace operators { namespace math { using float16 = paddle::platform::float16; +using bfloat16 = paddle::platform::bfloat16; template struct SetConstant; template struct SetConstant; @@ -31,12 +35,13 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ template struct Transpose; DEFINE_GPU_TRANS(1); @@ -46,6 +51,88 @@ DEFINE_GPU_TRANS(4); DEFINE_GPU_TRANS(5); DEFINE_GPU_TRANS(6); +#define REINTERPRET(T, DST_PTR, SRC_PTR) \ + T* DST_PTR = reinterpret_cast(SRC_PTR) + +template +__global__ void TransposeNormalKernel(const T* in_ptr, T* out_ptr, + int64_t element, + const int64_t* in_stride_ptr, + const int64_t* out_stride_ptr, + const int64_t* axis_ptr, int rank) { + CUDA_KERNEL_LOOP(out_idx, element) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride_ptr[i]; + tmp_idx -= coordinate * out_stride_ptr[i]; + in_idx += coordinate * in_stride_ptr[axis_ptr[i]]; + } + out_ptr[out_idx] = in_ptr[in_idx]; + } +} + +template +struct TransposeNormal { + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& in, framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = framework::stride(in.dims()); + auto out_stride = framework::stride(out->dims()); + auto* in_ptr = in.data(); + auto* out_ptr = out->data(); + + // copy in_stride, out_stride, axis to gpu device + const platform::CUDAPlace& cuda_place = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + platform::CPUPlace cpu_place = platform::CPUPlace(); + size_t size = 3 * rank * sizeof(int64_t); + auto cpu_buf_holder = memory::AllocShared(cpu_place, size); + auto cuda_buf_holder = memory::AllocShared(cuda_place, size); + REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); + REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); + for (int i = 0; i < rank; ++i) { + cpu_buf[i] = in_stride[i]; + cpu_buf[rank + i] = out_stride[i]; + cpu_buf[2 * rank + i] = axis[i]; + } + memory::Copy(cuda_place, cuda_buf, cpu_place, cpu_buf, size, + context.stream()); + REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); + REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); + REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); + + const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock(); + const int MAX_GRID_DIM = + context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; + int64_t elements = in.numel(); + int block_size = (elements >= MAX_BLOCK_DIM) + ? MAX_BLOCK_DIM + : (1 << static_cast(std::log2(elements))); + int grid_size = elements / block_size; + grid_size = (grid_size >= MAX_GRID_DIM) ? 
MAX_GRID_DIM : grid_size; + TransposeNormalKernel<T><<<grid_size, block_size, 0, context.stream()>>>( + in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr, + rank); + } +}; + +// define transpose normal +#define DEFINE_GPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal<platform::CUDADeviceContext, TYPE> + +DEFINE_GPU_TRANS_NORMAL(float16); +DEFINE_GPU_TRANS_NORMAL(bfloat16); +DEFINE_GPU_TRANS_NORMAL(float); +DEFINE_GPU_TRANS_NORMAL(double); +DEFINE_GPU_TRANS_NORMAL(int); +DEFINE_GPU_TRANS_NORMAL(int64_t); +DEFINE_GPU_TRANS_NORMAL(bool); +DEFINE_GPU_TRANS_NORMAL(int16_t); +DEFINE_GPU_TRANS_NORMAL(uint8_t); +DEFINE_GPU_TRANS_NORMAL(int8_t); + struct TensorSetConstantGPU { TensorSetConstantGPU(const platform::DeviceContext& context, framework::Tensor* tensor, float value) diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index 333552a0c1a79ed1682ea1f6da36f30e2ad5b4bf..6af0278d82503a27a14de6aef96468d69d6c17ad 100644 --- a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -26,6 +26,14 @@ limitations under the License. */ namespace paddle { namespace operators { namespace math { + +template <typename DeviceContext, typename T> +struct TransposeNormal { + // for dims >= 7 situation + void operator()(const DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector<int>& axis); +}; + template <typename DeviceContext, typename T, int Rank> struct Transpose { void operator()(const DeviceContext& context, const framework::Tensor& in, diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index 587823e535ac67f926fd469d2f43df536c8c88b6..3388d7edafecc4c0dd3a041316dc6f171d035319 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -226,8 +226,8 @@ TEST(math_funciton, set_constant) { for (int64_t i = 0; i < t.numel(); ++i) { PADDLE_ENFORCE_EQ(10, t.data<int>()[i], paddle::platform::errors::InvalidArgument( - "Each value of input" - "tensor should be 10, but received %d.", + "Each value of input tensor should be 10, " + "but received %d.", t.data<int>()[i])); } delete ctx; diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index de9113f2bb616b489747d8d960154f55bb988847..b90e7e1980335f1facd38738671e7986187d1ceb 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -33,10 +33,10 @@ namespace math { class Sampler { public: explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { - PADDLE_ENFORCE_GT(range, 0, platform::errors::InvalidArgument( - "Range should be" - " greater than 0, but recevied %d.", - range)); + PADDLE_ENFORCE_GT( + range, 0, + platform::errors::InvalidArgument( + "Range should be greater than 0, but received %d.", range)); if (seed == 0) { std::random_device r; seed_ = r(); diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index c05da0062f2bab66746feb9d8ebedeca0c0f9688..794fc647172b040d4e926144a87b84eb4e5216b0 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -34,16 +34,15 @@ class Vol2ColFunctor { const std::vector<int>& strides, const std::vector<int>& paddings, framework::Tensor* col, const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ( - vol.dims().size(), 4, - platform::errors::InvalidArgument("The dimension of" - " vol should be 4, but received %d.", - vol.dims().size())); - PADDLE_ENFORCE_EQ( - col->dims().size(), 7, - platform::errors::InvalidArgument("The dimension of" - "col should be
7, but received %d.", - col->dims().size())); + PADDLE_ENFORCE_EQ(vol.dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol.dims().size())); + + PADDLE_ENFORCE_EQ(col->dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col->dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); @@ -152,16 +151,15 @@ class Col2VolFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* vol, const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ( - vol->dims().size(), 4, - platform::errors::InvalidArgument("The dimension of vol" - " should be 4, but received %d.", - vol->dims().size())); - PADDLE_ENFORCE_EQ( - col.dims().size(), 7, - platform::errors::InvalidArgument("The dimension of col" - " should be 7, but received %d.", - col.dims().size())); + PADDLE_ENFORCE_EQ(vol->dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol->dims().size())); + + PADDLE_ENFORCE_EQ(col.dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col.dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); @@ -192,29 +190,29 @@ class Col2VolFunctor { ((dilations[0] * (filter_depth - 1) + 1))) / strides[0] + 1; - PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth, - platform::errors::InvalidArgument( - "input_depth(%d)" - " and output_depth(%d) are mismatching.", - input_depth_tmp, output_depth)); + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); auto input_height_tmp = (input_height + pad_h_up + pad_h_down - ((dilations[1] * (filter_height - 1) + 1))) / strides[1] + 1; - PADDLE_ENFORCE_EQ(input_height_tmp, output_height, - platform::errors::InvalidArgument( - "input_height(%d)" - " and output_height(%d) are mismatching.", - input_height_tmp, output_height)); + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); auto input_width_tmp = (input_width + pad_w_left + pad_w_right - ((dilations[2] * (filter_width - 1) + 1))) / strides[2] + 1; - PADDLE_ENFORCE_EQ(input_width_tmp, output_width, - platform::errors::InvalidArgument( - "input_width(%d)" - " and output_width(%d) are mismatching.", - input_width_tmp, output_width)); + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); T* vol_data = vol->data(); const T* col_data = col.data(); diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index fe5a600909893b8313d470923ef4d43eae155e76..eca39e919737210267d7af1856903d3e1fc697d1 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -90,16 +90,14 @@ class Vol2ColFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* col, const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ( - vol.dims().size(), 4, - platform::errors::InvalidArgument("The dimension of" - " vol should be 4, but received %d.", - vol.dims().size())); - PADDLE_ENFORCE_EQ( - col->dims().size(), 7, - 
platform::errors::InvalidArgument("The dimension of" - "col should be 7, but received %d.", - col->dims().size())); + PADDLE_ENFORCE_EQ(vol.dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol.dims().size())); + PADDLE_ENFORCE_EQ(col->dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col->dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); @@ -253,16 +251,14 @@ class Col2VolFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* vol, const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ( - vol->dims().size(), 4, - platform::errors::InvalidArgument("The dimension of vol" - " should be 4, but received %d.", - vol->dims().size())); - PADDLE_ENFORCE_EQ( - col.dims().size(), 7, - platform::errors::InvalidArgument("The dimension of col" - " should be 7, but received %d.", - col.dims().size())); + PADDLE_ENFORCE_EQ(vol->dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol->dims().size())); + PADDLE_ENFORCE_EQ(col.dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col.dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); @@ -291,29 +287,29 @@ class Col2VolFunctor { ((dilations[0] * (filter_depth - 1) + 1))) / strides[0] + 1; - PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth, - platform::errors::InvalidArgument( - "input_depth(%d)" - " and output_depth(%d) are mismatching.", - input_depth_tmp, output_depth)); + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); auto input_height_tmp = (input_height + pad_h_up + pad_h_down - ((dilations[1] * (filter_height - 1) + 1))) / strides[1] + 1; - PADDLE_ENFORCE_EQ(input_height_tmp, output_height, - platform::errors::InvalidArgument( - "input_height(%d)" - " and output_height(%d) are mismatching.", - input_height_tmp, output_height)); + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); auto input_width_tmp = (input_width + pad_w_left + pad_w_right - ((dilations[2] * (filter_width - 1) + 1))) / strides[2] + 1; - PADDLE_ENFORCE_EQ(input_width_tmp, output_width, - platform::errors::InvalidArgument( - "input_width(%d)" - " and output_width(%d) are mismatching.", - input_width_tmp, output_width)); + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); int num_kernels = input_channels * input_depth * input_height * input_width; diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 3cafb0e9fc6147626f066bbeba1b10d074a37b87..b2815cbdc65b53beba9cdb1864d10875d5db5e62 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -86,8 +86,10 @@ class ConcatPrimitiveFactory { concat CreateConcatPrimitive(const concat::primitive_desc& concat_pd, Tensor* output, platform::CPUPlace place, const mkldnn::engine& mkldnn_engine) { - dst_mem = 
mkldnn::memory(concat_pd.dst_desc(), mkldnn_engine, - output->mutable_data(place)); + dst_mem = mkldnn::memory( + concat_pd.dst_desc(), mkldnn_engine, + output->mutable_data(place, concat_pd.dst_desc().get_size())); + return concat(concat_pd); } @@ -193,7 +195,9 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { prim_creator.SetSrcDataHandleByIndex( *srcs, i, to_void_cast(multi_input[i]->data())); } - prim_creator.SetDstDataHandle(*dst_mem, output->mutable_data(place)); + prim_creator.SetDstDataHandle( + *dst_mem, + output->mutable_data(place, concat_pd->dst_desc().get_size())); } mkldnn::stream astream(mkldnn_engine); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 67a19cb83c36f9cb6ef0cdd65e9fc04a7bb4d169..25f9453571ac632e70b0755ca1e5566eb5bf6ee6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -18,9 +18,10 @@ limitations under the License. */ #include #include #include - #include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/cast_op.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" namespace paddle { @@ -34,6 +35,110 @@ namespace operators { } using Tensor = framework::Tensor; +using DDim = framework::DDim; + +inline void GetShuffledDim(const DDim& src_dims, DDim* dst_dims, + const std::vector& reduced_dims, + std::vector* perm_axis) { + // check if it's a reduced dim + std::vector src_dims_check(src_dims.size(), false); + size_t src_size = src_dims.size(); + size_t reduce_size = reduced_dims.size(); + for (size_t i = 0; i < reduce_size; ++i) { + dst_dims->at(src_size - reduce_size + i) = src_dims[reduced_dims[i]]; + (*perm_axis)[src_size - reduce_size + i] = reduced_dims[i]; + src_dims_check[reduced_dims[i]] = true; + } + + size_t offset = 0; + for (size_t i = 0; i < src_dims_check.size(); ++i) { + bool is_reduced = src_dims_check[i]; + if (!is_reduced) { + (*perm_axis)[offset] = i; + dst_dims->at(offset++) = src_dims[i]; + } + } +} + +template +void GetShuffledInput(const framework::ExecutionContext& context, + const Tensor* input, Tensor* shuffled_input, + const std::vector& dims) { + DDim shuffled_dims(input->dims()); + std::vector perm_axis(input->dims().size()); + GetShuffledDim(input->dims(), &shuffled_dims, dims, &perm_axis); + + shuffled_input->Resize(shuffled_dims); + shuffled_input->mutable_data(context.GetPlace()); + + math::TransposeNormal trans; + trans(context.template device_context(), *input, + shuffled_input, perm_axis); +} + +inline void GetOriginDimFromShuffled(const DDim& src_dim, + const std::vector& dims, + std::vector* origin_dim) { + DDim shuffled_dims(src_dim); + size_t n = src_dim.size(); + std::vector perm_axis(n); + GetShuffledDim(src_dim, &shuffled_dims, dims, &perm_axis); + for (size_t i = 0; i < n; ++i) { + (*origin_dim)[perm_axis[i]] = i; + } +} + +template +void HandleLargeDim(const framework::ExecutionContext& context, + const Tensor* input, Tensor* output, + const std::vector& dims, bool keep_dim) { + // shuffle the reduced dim to the end + Tensor shuffled_input; + GetShuffledInput(context, input, &shuffled_input, dims); + + // transpose to 2D tensor whose shape is {unreduced, reduced}. 
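+ // For example, reducing dims {1, 3} of a [2, 3, 4, 5] input shuffles it to + // [2, 4, 3, 5], which is then viewed as an {unreduced = 8, reduced = 15} + // matrix and reduced along axis 1.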
+ const int64_t unreduced = output->numel(); + const int64_t reduced = shuffled_input.numel() / unreduced; + shuffled_input.Resize({unreduced, reduced}); + DDim output_dim = output->dims(); + output->Resize({unreduced}); + ReduceFunctor( + context.template device_context(), shuffled_input, output, + {1}, keep_dim); + output->Resize(output_dim); +} + +template +void HandleLargeDimGrad(const framework::ExecutionContext& context, + const framework::Tensor* x, + const framework::Tensor* out, + const framework::Tensor* dout, framework::Tensor* dx, + const std::vector& dims) { + const int64_t unreduced = out->numel(); + const int64_t reduced = x->numel() / unreduced; + DDim out_dim(out->dims()); + DDim x_dim(x->dims()); + // transpose and reshape X + Tensor shuffled_x; + GetShuffledInput(context, x, &shuffled_x, dims); + DDim shuffled_dim = shuffled_x.dims(); + shuffled_x.Resize({unreduced, reduced}); + // reshape dX {unreduced, reduced} + dx->Resize({unreduced, reduced}); + ReduceGradFunctor( + context.template device_context(), shuffled_x, *out, *dout, + dx, {1}); + // transpose dX + std::vector origin_axis(x_dim.size()); + GetOriginDimFromShuffled(x_dim, dims, &origin_axis); + Tensor dx_tmp; + framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp); + dx_tmp.Resize(shuffled_dim); + dx->Resize(x_dim); + math::TransposeNormal trans; + trans(context.template device_context(), dx_tmp, dx, + origin_axis); +} template struct ReduceKernelFunctor { @@ -69,22 +174,27 @@ struct ReduceKernelFunctor { } else { int ndim = input->dims().size(); int rdim = dims.size(); - HANDLE_DIM(6, 5); - HANDLE_DIM(6, 4); - HANDLE_DIM(6, 3); - HANDLE_DIM(6, 2); - HANDLE_DIM(6, 1); - HANDLE_DIM(5, 4); - HANDLE_DIM(5, 3); - HANDLE_DIM(5, 2); - HANDLE_DIM(5, 1); - HANDLE_DIM(4, 3); - HANDLE_DIM(4, 2); - HANDLE_DIM(4, 1); - HANDLE_DIM(3, 2); - HANDLE_DIM(3, 1); - HANDLE_DIM(2, 1); - HANDLE_DIM(1, 1); + if (ndim > 6) { + HandleLargeDim(context, input, output, + dims, keep_dim); + } else { + HANDLE_DIM(6, 5); + HANDLE_DIM(6, 4); + HANDLE_DIM(6, 3); + HANDLE_DIM(6, 2); + HANDLE_DIM(6, 1); + HANDLE_DIM(5, 4); + HANDLE_DIM(5, 3); + HANDLE_DIM(5, 2); + HANDLE_DIM(5, 1); + HANDLE_DIM(4, 3); + HANDLE_DIM(4, 2); + HANDLE_DIM(4, 1); + HANDLE_DIM(3, 2); + HANDLE_DIM(3, 1); + HANDLE_DIM(2, 1); + HANDLE_DIM(1, 1); + } } } }; @@ -137,7 +247,6 @@ class ReduceKernel : public framework::OpKernel { } } }; - template class BoolReduceKernel : public framework::OpKernel { public: @@ -175,22 +284,27 @@ class BoolReduceKernel : public framework::OpKernel { int ndim = input->dims().size(); int rdim = dims.size(); // comments for accelerating compiling temporarily. 
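The `HandleLargeDim` path added above removes the old rank-6 ceiling with a shuffle-and-reshape trick: permute the reduced axes to the end, view the result as a 2-D {unreduced, reduced} matrix, and reduce along the second axis. A minimal plain-Python sketch of that arithmetic follows; numpy and the helper name are illustrative only, not the kernel itself, and sum stands in for the generic functor:

.. code-block:: python

    import numpy as np

    def large_dim_reduce_sum(x, reduced_dims):
        # The permutation GetShuffledDim builds: unreduced axes first
        # (in order), then the reduced axes (in order).
        unreduced = [i for i in range(x.ndim) if i not in reduced_dims]
        perm = unreduced + list(reduced_dims)
        shuffled = np.transpose(x, perm)  # the TransposeNormal step
        # Collapse to {unreduced, reduced} and reduce along axis 1.
        n_unreduced = int(np.prod([x.shape[i] for i in unreduced]))
        out = shuffled.reshape(n_unreduced, -1).sum(axis=1)
        return out.reshape([x.shape[i] for i in unreduced])

    x = np.random.rand(2, 3, 2, 2, 2, 2, 2, 2)  # rank 8, beyond HANDLE_DIM(6, ...)
    assert np.allclose(large_dim_reduce_sum(x, [1, 3, 5]), x.sum(axis=(1, 3, 5)))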
- // HANDLE_DIM(6, 5); - // HANDLE_DIM(6, 4); - // HANDLE_DIM(6, 3); - // HANDLE_DIM(6, 2); - // HANDLE_DIM(6, 1); - // HANDLE_DIM(5, 4); - // HANDLE_DIM(5, 3); - // HANDLE_DIM(5, 2); - // HANDLE_DIM(5, 1); - HANDLE_DIM(4, 3); - HANDLE_DIM(4, 2); - HANDLE_DIM(4, 1); - HANDLE_DIM(3, 2); - HANDLE_DIM(3, 1); - HANDLE_DIM(2, 1); - HANDLE_DIM(1, 1); + if (ndim > 6) { + HandleLargeDim(context, input, output, + dims, keep_dim); + } else { + HANDLE_DIM(6, 5); + HANDLE_DIM(6, 4); + HANDLE_DIM(6, 3); + HANDLE_DIM(6, 2); + HANDLE_DIM(6, 1); + HANDLE_DIM(5, 4); + HANDLE_DIM(5, 3); + HANDLE_DIM(5, 2); + HANDLE_DIM(5, 1); + HANDLE_DIM(4, 3); + HANDLE_DIM(4, 2); + HANDLE_DIM(4, 1); + HANDLE_DIM(3, 2); + HANDLE_DIM(3, 1); + HANDLE_DIM(2, 1); + HANDLE_DIM(1, 1); + } } } }; @@ -279,6 +393,10 @@ class ReduceGradKernel : public framework::OpKernel { context.template device_context(), *input0, *input1, *input2, output, dims); break; + default: + HandleLargeDimGrad(context, input0, input1, + input2, output, dims); + break; } } } @@ -313,12 +431,6 @@ class ReduceOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceOp"); auto x_dims = ctx->GetInputDim("X"); auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, - platform::errors::InvalidArgument( - "The input tensor X's dimensions of ReduceOp " - "should be less equal than 6. But received X's " - "dimensions = %d, X's shape = [%s].", - x_rank, x_dims)); auto dims = ctx->Attrs().Get>("dim"); PADDLE_ENFORCE_GT(dims.size(), 0, platform::errors::InvalidArgument( @@ -402,11 +514,6 @@ class ReduceGradOp : public framework::OperatorWithKernel { "Out@GRAD", "ReduceOp"); auto x_dims = ctx->GetInputDim("X"); auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, - platform::errors::InvalidArgument( - "Tensors with rank at most 6 are supported by " - "ReduceOp. Received tensor with rank %d.", - x_rank)); auto dims = ctx->Attrs().Get>("dim"); for (size_t i = 0; i < dims.size(); ++i) { PADDLE_ENFORCE_LT(dims[i], x_rank, diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 62bffe630484e3ab30bedcf2324f6516bca3b27e..0ecf9bfb5d8c0ccadbdd8b7a0b8f6193d4dc5310 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -68,6 +68,6 @@ REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, +REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, ops::ShapeKernel, ops::ShapeKernel, ops::ShapeKernel); diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu index 4b9dca0d4028be36ad8ba46ebe35db101e003ee9..5d50b17818cbb8068db2fded1f3f4e76bad44430 100644 --- a/paddle/fluid/operators/shape_op.cu +++ b/paddle/fluid/operators/shape_op.cu @@ -15,8 +15,8 @@ limitations under the License. 
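On the backward side, `HandleLargeDimGrad` above applies the same shuffle, reduces the gradient in the {unreduced, reduced} view, and then restores the original layout with the inverse permutation computed by `GetOriginDimFromShuffled`. A hedged sketch of just that inverse-permutation step, with illustrative names and numpy used only for checking:

.. code-block:: python

    import numpy as np

    def inverse_permutation(perm):
        # GetOriginDimFromShuffled in essence: origin[perm[i]] = i
        inv = [0] * len(perm)
        for i, p in enumerate(perm):
            inv[p] = i
        return inv

    # axes [1, 3] reduced out of a rank-5 tensor: unreduced first, reduced last
    perm = [0, 2, 4, 1, 3]
    inv = inverse_permutation(perm)  # [0, 3, 1, 4, 2]

    x = np.random.rand(2, 3, 4, 5, 6)
    # shuffling with perm and then with its inverse is the identity,
    # which is why dx comes back in the original layout of x
    assert np.array_equal(np.transpose(np.transpose(x, perm), inv), x)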
*/ #include "paddle/fluid/operators/shape_op.h" REGISTER_OP_CUDA_KERNEL( - shape, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, + shape, paddle::operators::ShapeKernel, + paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index d7f5c3dd457c90eefc4181cdbc662196a046853e..e4e5dfdba9f6057161051e551d1f6711ba0cd4e9 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -53,10 +53,9 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx, trans6(dev_ctx, in, out, axis); break; default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Tensors with rank at most 6 are supported" - ", but received input tensor's rank is %d,", - dim)); + // for dim >= 7 situation + math::TransposeNormal trans_normal; + trans_normal(dev_ctx, in, out, axis); } } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 29982c13c8ca88bc8b4a168f92e4116a283a97e8..34305c404b4df72ed28547e46817be00d6722a42 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include #include +#include //NOLINT #include #include @@ -23,6 +24,7 @@ limitations under the License. */ #endif #include "glog/logging.h" +#include "unsupported/Eigen/CXX11/ThreadPool" namespace paddle { namespace memory { @@ -131,16 +133,31 @@ DeviceContextPool::DeviceContextPool( CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); + InitPoolDevice(); } CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) { eigen_device_.reset(new Eigen::DefaultDevice()); + InitPoolDevice(); +} + +void CPUDeviceContext::InitPoolDevice() { + using EigenEnv = Eigen::StlThreadEnvironment; + using EigenThreadPool = Eigen::ThreadPoolTempl; + int num_threads = std::thread::hardware_concurrency(); + eigen_threadpool_.reset(new EigenThreadPool(num_threads)); + eigen_pool_device_.reset( + new Eigen::ThreadPoolDevice(eigen_threadpool_.get(), num_threads)); } Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { return eigen_device_.get(); } +Eigen::ThreadPoolDevice* CPUDeviceContext::eigen_pool_device() const { + return eigen_pool_device_.get(); +} + Place CPUDeviceContext::GetPlace() const { return place_; } #ifdef PADDLE_WITH_XPU diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 8bfdfc8a1c6033a79c197e1cd425197f77079bda..28d94627f9575573075beaa328a682314b5c3b71 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -41,6 +41,7 @@ limitations under the License. 
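The new `default:` branch in `TransCompute` above falls back to `math::TransposeNormal` instead of throwing for ranks above 6. Conceptually, such a rank-agnostic transpose is plain flat-index arithmetic; the sketch below is an assumption-laden Python model of the mapping such a kernel has to compute, not the Paddle implementation:

.. code-block:: python

    import numpy as np

    def transpose_normal(x, axis):
        # Rank-agnostic transpose: for every output element, decode its
        # coordinate, then re-encode it as a flat offset into the input.
        in_shape, ndim = x.shape, x.ndim
        out_shape = tuple(in_shape[a] for a in axis)
        in_strides = [1] * ndim
        for i in range(ndim - 2, -1, -1):
            in_strides[i] = in_strides[i + 1] * in_shape[i + 1]
        flat_in = x.reshape(-1)
        flat_out = np.empty(flat_in.size, dtype=x.dtype)
        for off in range(flat_out.size):
            rem, in_off = off, 0
            for d in range(len(out_shape) - 1, -1, -1):
                rem, coord = divmod(rem, out_shape[d])
                in_off += coord * in_strides[axis[d]]
            flat_out[off] = flat_in[in_off]
        return flat_out.reshape(out_shape)

    x = np.arange(2 * 3 * 2 * 2 * 2 * 2 * 2).reshape(2, 3, 2, 2, 2, 2, 2)  # rank 7
    axis = (6, 0, 5, 1, 4, 2, 3)
    assert np.array_equal(transpose_normal(x, axis), np.transpose(x, axis))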
*/ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/stream/cuda_stream.h" #endif +#define EIGEN_USE_THREADS #include "unsupported/Eigen/CXX11/Tensor" #ifdef PADDLE_WITH_XPU @@ -65,11 +66,17 @@ class CPUDeviceContext : public DeviceContext { Eigen::DefaultDevice* eigen_device() const; + Eigen::ThreadPoolDevice* eigen_pool_device() const; + Place GetPlace() const override; + inline void InitPoolDevice(); + private: CPUPlace place_; std::unique_ptr eigen_device_; + std::unique_ptr eigen_pool_device_; + std::unique_ptr eigen_threadpool_; }; template diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index ec07565c5af6c7ba79c15d9a335313775719c682..3de577d847dd027c4778ccee760d5298501bad80 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -621,6 +621,7 @@ function generate_upstream_develop_api_spec() { git checkout -b develop_base_pr upstream/$BRANCH cmake_gen $1 build $2 + cp ${PADDLE_ROOT}/python/requirements.txt /tmp git checkout $cur_branch generate_api_spec "$1" "DEV" @@ -641,7 +642,12 @@ function generate_api_spec() { cd ${PADDLE_ROOT}/build/.check_api_workspace virtualenv .${spec_kind}_env source .${spec_kind}_env/bin/activate - pip install -r ${PADDLE_ROOT}/python/requirements.txt + + if [ "$spec_kind" == "DEV" ]; then + pip install -r /tmp/requirements.txt + else + pip install -r ${PADDLE_ROOT}/python/requirements.txt + fi pip --no-cache-dir install ${PADDLE_ROOT}/build/python/dist/*whl spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path @@ -930,6 +936,10 @@ function parallel_test_base_gpu() { EOF set +x + precision_cases="" + if [ ${PRECISION_TEST:-OFF} == "ON" ]; then + precision_cases=`python $PADDLE_ROOT/tools/get_pr_ut.py` + fi EXIT_CODE=0; test_cases=$(ctest -N -V) # get all test cases exclusive_tests='' # cases list which would be run exclusively @@ -959,10 +969,23 @@ set +x echo $testcase" will only run at night." continue fi + if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]]; then + will_test="false" + for case in $precision_cases; do + if [[ $testcase == $case ]]; then + will_test="true" + break + fi + done + if [[ $will_test == "false" ]]; then + echo $testcase" won't run in PRECISION_TEST mode."
+ continue + fi + fi if [[ "$is_multicard" == "" ]]; then # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs - read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist") + read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_") fi if [[ "$is_exclusive" != "" ]]; then @@ -1077,8 +1100,6 @@ set +x done fi - - if [[ "$EXIT_CODE" != "0" ]]; then if [[ "$failed_test_lists" == "" ]]; then echo "========================================" diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 016726633ea355ed20149e94833ca7e1657c3f7d..661471599cb080da7a65c11fecc339830f2c00ee 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -77,6 +77,7 @@ from .tensor.creation import triu #DEFINE_ALIAS from .tensor.creation import tril #DEFINE_ALIAS from .tensor.creation import meshgrid #DEFINE_ALIAS from .tensor.creation import empty #DEFINE_ALIAS +from .tensor.creation import empty_like #DEFINE_ALIAS from .tensor.linalg import matmul #DEFINE_ALIAS from .tensor.linalg import dot #DEFINE_ALIAS # from .tensor.linalg import einsum #DEFINE_ALIAS diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 1b86056c00443be4170757cee3cc60bbafd0f40b..f1c836468daf36db753c67a3e09757be728d37a7 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -728,6 +728,63 @@ class DistributedStrategy(object): "localsgd_configs") assign_configs_value(self.strategy.localsgd_configs, configs) + @property + def adaptive_localsgd(self): + """ + Indicating whether we are using Adaptive Local SGD training. Default Value: False + For more details, please refer to `Adaptive Communication Strategies to Achieve + the Best Error-Runtime Trade-off in Local-Update SGD `_. + + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.adaptive_localsgd = True # by default this is false + + """ + return self.strategy.adaptive_localsgd + + @adaptive_localsgd.setter + @is_strict_auto + def adaptive_localsgd(self, flag): + if isinstance(flag, bool): + self.strategy.adaptive_localsgd = flag + else: + print("WARNING: adaptive_localsgd should have value of bool type") + + @property + def adaptive_localsgd_configs(self): + """ + Set AdaptiveLocalSGD training configurations. AdaptiveLocalSGD has configurable + settings that can be set through a dict. + + **Notes**: + init_k_steps(int) The initial number of local training steps before communication. + The adaptive localsgd method will then adjust it automatically. + Default 1. + begin_step(int) The step at which training with adaptive localsgd begins. Default 1. + + Examples: + ..
code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.adaptive_localsgd = True + strategy.adaptive_localsgd_configs = {"init_k_steps": 1, + "begin_step": 30} + """ + + return get_msg_dict(self.strategy.adaptive_localsgd_configs) + + @adaptive_localsgd_configs.setter + @is_strict_auto + def adaptive_localsgd_configs(self, configs): + check_configs_key(self.strategy.adaptive_localsgd_configs, configs, + "adaptive_localsgd_configs") + assign_configs_value(self.strategy.adaptive_localsgd_configs, configs) + @property def dgc(self): """ diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 2a3f52be613e7215c21db6cc75724e2ac6d02ba8..dc6e7c9b52b9a0ecf1d86a19b537d746091c0b5f 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -611,25 +611,31 @@ class Fleet(object): @dygraph_only def distributed_model(self, model): """ - Return dygraph distributed data parallel model (Layer) - Only work in dygraph mode + Return distributed data parallel model (Only work in dygraph mode) + + Args: + model (Layer): the user-defined model which inherits Layer. + + Returns: + distributed data parallel model which inherits Layer. Examples: + .. code-block:: python - import paddle - import paddle.nn as nn - from paddle.distributed import fleet - class LinearNet(nn.Layer): - def __init__(self): - super(LinearNet, self).__init__() - self._linear1 = nn.Linear(10, 10) - self._linear2 = nn.Linear(10, 1) + import paddle + import paddle.nn as nn + from paddle.distributed import fleet + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) - def forward(self, x): - return self._linear2(self._linear1(x)) + def forward(self, x): + return self._linear2(self._linear1(x)) - def train(): # 1. enable dynamic mode paddle.disable_static() @@ -661,8 +667,7 @@ class Fleet(object): adam.step() adam.clear_grad() - if __name__ == '__main__': - paddle.distributed.spawn(train) + """ assert model is not None self.model = paddle.DataParallel(model) @@ -672,29 +677,30 @@ class Fleet(object): def state_dict(self): """ Get state dict information from optimizer. - Only work in dygraph mode + (Only work in dygraph mode) Returns: state_dict(dict) : dict contains all the Tensor used by optimizer Examples: ..
code-block:: python - import numpy as np - import paddle - from paddle.distributed import fleet - paddle.disable_static() - fleet.init(is_collective=True) + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.fluid.dygraph.to_variable(value) + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) - layer = paddle.nn.Linear(13, 5) - adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) - adam = fleet.distributed_optimizer(adam) - dp_layer = fleet.distributed_model(layer) - state_dict = adam.state_dict() + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + state_dict = adam.state_dict() """ # imitate target optimizer retrieval return self.user_defined_optimizer.state_dict() @@ -703,34 +709,36 @@ class Fleet(object): def set_state_dict(self, state_dict): """ Load optimizer state dict. - Only work in dygraph mode + (Only work in dygraph mode) Args: state_dict(dict) : Dict contains all the Tensor needed by optimizer - Returns: None + Returns: + None Examples: .. code-block:: python - import numpy as np - import paddle - from paddle.distributed import fleet - paddle.disable_static() - fleet.init(is_collective=True) + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.fluid.dygraph.to_variable(value) + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) - layer = paddle.nn.Linear(13, 5) - adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) - adam = fleet.distributed_optimizer(adam) - dp_layer = fleet.distributed_model(layer) - state_dict = adam.state_dict() - paddle.framework.save(state_dict, "paddle_dy") - para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy") - adam.set_state_dict(opti_state_dict) + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + state_dict = adam.state_dict() + paddle.framework.save(state_dict, "paddle_dy") + para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy") + adam.set_state_dict(opti_state_dict) """ # imitate target optimizer retrieval return self.user_defined_optimizer.set_state_dict(state_dict) @@ -739,42 +747,44 @@ class Fleet(object): def set_lr(self, value): """ Set the value of the learning rate manually in the optimizer. - Only work in dygraph mode + (Only work in dygraph mode) Args: value (float|Tensor): the value of learning rate - Returns: None + Returns: + None Examples: .. 
code-block:: python - import numpy as np - import paddle - from paddle.distributed import fleet - paddle.disable_static() - fleet.init(is_collective=True) + import numpy as np + import paddle + from paddle.distributed import fleet - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.fluid.dygraph.to_variable(value) + paddle.disable_static() + fleet.init(is_collective=True) - layer = paddle.nn.Linear(13, 5) - adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) - adam = fleet.distributed_optimizer(adam) - dp_layer = fleet.distributed_model(layer) + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) - lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] - for i in range(5): - adam.set_lr(lr_list[i]) - lr = adam.get_lr() - print("current lr is {}".format(lr)) - # Print: - # current lr is 0.2 - # current lr is 0.3 - # current lr is 0.4 - # current lr is 0.5 - # current lr is 0.6 + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] + for i in range(5): + adam.set_lr(lr_list[i]) + lr = adam.get_lr() + print("current lr is {}".format(lr)) + # Print: + # current lr is 0.2 + # current lr is 0.3 + # current lr is 0.4 + # current lr is 0.5 + # current lr is 0.6 """ # imitate target optimizer retrieval return self.user_defined_optimizer.set_lr(value) @@ -783,31 +793,32 @@ class Fleet(object): def get_lr(self): """ Get current step learning rate. - Only work in dygraph mode + (Only work in dygraph mode) Returns: float: The learning rate of the current step. Examples: .. code-block:: python - import numpy as np - import paddle - from paddle.distributed import fleet - paddle.disable_static() - fleet.init(is_collective=True) + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.fluid.dygraph.to_variable(value) + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) - layer = paddle.nn.Linear(13, 5) - adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) - adam = fleet.distributed_optimizer(adam) - dp_layer = fleet.distributed_model(layer) + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) - lr = adam.get_lr() - print(lr) # 0.01 + lr = adam.get_lr() + print(lr) # 0.01 """ # imitate target optimizer retrieval return self.user_defined_optimizer.get_lr() @@ -816,27 +827,27 @@ class Fleet(object): def step(self): """ Execute the optimizer once. - Only work in dygraph mode + (Only work in dygraph mode) - Returns: None + Returns: + None Examples: .. 
code-block:: python - import paddle - import paddle.nn as nn - from paddle.distributed import fleet + import paddle + import paddle.nn as nn + from paddle.distributed import fleet - class LinearNet(nn.Layer): - def __init__(self): - super(LinearNet, self).__init__() - self._linear1 = nn.Linear(10, 10) - self._linear2 = nn.Linear(10, 1) + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) - def forward(self, x): - return self._linear2(self._linear1(x)) + def forward(self, x): + return self._linear2(self._linear1(x)) - def train(): # 1. enable dynamic mode paddle.disable_static() @@ -868,8 +879,6 @@ class Fleet(object): adam.step() adam.clear_grad() - if __name__ == '__main__': - paddle.distributed.spawn(train) """ # imitate target optimizer retrieval @@ -878,28 +887,28 @@ class Fleet(object): @dygraph_only def clear_grad(self): """ - Execute the optimizer once. - Only work in dygraph mode + Clear the gradients of all optimized parameters for model. + (Only work in dygraph mode) - Returns: None + Returns: + None Examples: .. code-block:: python - import paddle - import paddle.nn as nn - from paddle.distributed import fleet + import paddle + import paddle.nn as nn + from paddle.distributed import fleet - class LinearNet(nn.Layer): - def __init__(self): - super(LinearNet, self).__init__() - self._linear1 = nn.Linear(10, 10) - self._linear2 = nn.Linear(10, 1) + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) - def forward(self, x): - return self._linear2(self._linear1(x)) + def forward(self, x): + return self._linear2(self._linear1(x)) - def train(): # 1. 
enable dynamic mode paddle.disable_static() @@ -931,8 +940,6 @@ class Fleet(object): adam.step() adam.clear_grad() - if __name__ == '__main__': - paddle.distributed.spawn(train) """ # imitate target optimizer retrieval return self.user_defined_optimizer.clear_grad() diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 8614b1861343b8e48b55a8e75d9e432ef6329184..a3a809ee375f879d0b5903ac02e28f15212e00a8 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -637,7 +637,7 @@ class PaddleCloudRoleMaker(RoleMakerBase): return "lo" def __start_kv_server(self, http_server_d, size_d): - from paddle.distributed.fleet.utils import KVServer + from paddle.distributed.fleet.utils.http_server import KVServer http_server = KVServer(int(self._http_ip_port[1]), size_d) http_server.start() wait_seconds = 5 @@ -651,6 +651,7 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker): def __init__(self, is_collective=False, init_gloo=False, **kwargs): super(UserDefinedRoleMaker, self).__init__( is_collective=is_collective, init_gloo=init_gloo, **kwargs) + self._init_gloo = init_gloo def _user_defined_ps_env(self): self._server_endpoints = self._kwargs.get("server_endpoints") diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py index f5a6c417c0c45bea819c5832f98b5b6c9fabbd4b..4fa247c319616d0a2f1ffbc2d26753dbd278f12f 100644 --- a/python/paddle/distributed/fleet/base/util_factory.py +++ b/python/paddle/distributed/fleet/base/util_factory.py @@ -16,20 +16,18 @@ """basic collective operations in python""" """remote file system""" -__all__ = ['UtilBase'] - -import numpy as np -import os - -import subprocess -from paddle.fluid import core -from collections import OrderedDict -import paddle.fluid as fluid -from google.protobuf import text_format -from paddle.fluid import debugger -from paddle.fluid.framework import Program -from paddle.fluid.proto import framework_pb2 from ..utils.fs import FS, LocalFS, HDFSClient +from paddle.fluid.proto import framework_pb2 +from paddle.fluid.framework import Program +from paddle.fluid import debugger +from google.protobuf import text_format +import paddle.fluid as fluid +from collections import OrderedDict +from paddle.fluid import core +import subprocess +import os +import numpy as np +__all__ = ['UtilBase'] class UtilFactory(object): @@ -53,7 +51,7 @@ class UtilBase(object): def _set_role_maker(self, role_maker): self.role_maker = role_maker - def set_file_system(self, fs_client): + def _set_file_system(self, fs_client): assert isinstance( fs_client, FS ), "fs_client must be the instance of paddle.distributed.fleet.utils.FS" @@ -87,36 +85,183 @@ class UtilBase(object): return _comm_world def all_reduce(self, input, mode, comm_world="worker"): + """ + All reduce `input` between specified collection. This is a distributed API. + + Args: + input (list|numpy.array): The input variable to do all_reduce between specified collection. + mode (str): "sum" or "min" or "max". + comm_world (str, optional): Collection used to execute all_reduce operation. Supported collections include `worker` , `server` and `all` . The default is `worker` . + + Returns: + output(Numpy.array|None): A numpy array with the same shape as the `input` . + + Examples: + .. code-block:: python + + # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
+ from paddle.distributed.fleet.base.util_factory import fleet_util + import paddle.distributed.fleet as fleet + from paddle.distributed.fleet import PaddleCloudRoleMaker + import sys + import numpy as np + + def train(): + role = PaddleCloudRoleMaker( + is_collective=False, + init_gloo=True, + path="./tmp_gloo") + fleet.init(role) + fleet_util._set_role_maker(role) + + if fleet.is_server(): + input = [1, 2] + output = fleet_util.all_reduce(input, "sum", "server") + print(output) + # [2, 4] + elif fleet.is_worker(): + input = np.array([3, 4]) + output = fleet_util.all_reduce(input, "sum", "worker") + print(output) + # [6, 8] + output = fleet_util.all_reduce(input, "sum", "all") + print(output) + # [8, 12] + if __name__ == "__main__": + train() + """ _comm_world = self.__check_comm_world(comm_world) return self.role_maker._all_reduce(_comm_world, input, mode) def barrier(self, comm_world="worker"): + """ + Barrier between specified collection. + + Args: + comm_world (str, optional): Collection used to execute barrier operation. Supported collections include `worker` , `server` and `all` . The default is `worker` . + + Examples: + + .. code-block:: python + # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` . + + from paddle.distributed.fleet.base.util_factory import fleet_util + import paddle.distributed.fleet as fleet + from paddle.distributed.fleet import PaddleCloudRoleMaker + import sys + + def train(): + role = PaddleCloudRoleMaker( + is_collective=False, + init_gloo=True, + path="./tmp_gloo") + fleet.init(role) + fleet_util._set_role_maker(role) + + if fleet.is_server(): + fleet_util.barrier("server") + print("all servers arrive here") + elif fleet.is_worker(): + fleet_util.barrier("worker") + print("all workers arrive here") + fleet_util.barrier("all") + print("all servers and workers arrive here") + + if __name__ == "__main__": + train() + """ _comm_world = self.__check_comm_world(comm_world) self.role_maker._barrier(_comm_world) def all_gather(self, input, comm_world="worker"): + """ + All gather `input` between specified collection. + + Args: + input (Int|Float): The input variable to do all_gather between specified collection. + comm_world (str, optional): Collection used to execute all_gather operation. Supported collections include `worker` , `server` and `all` . The default is `worker` . + + Returns: + output (List): A list of gathered values. + + Examples: + + .. code-block:: python + + # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
+ from paddle.distributed.fleet.base.util_factory import fleet_util + import paddle.distributed.fleet as fleet + from paddle.distributed.fleet import PaddleCloudRoleMaker + import sys + + def train(): + role = PaddleCloudRoleMaker( + is_collective=False, + init_gloo=True, + path="./tmp_gloo") + fleet.init(role) + fleet_util._set_role_maker(role) + + if fleet.is_server(): + input = fleet.server_index() + output = fleet_util.all_gather(input, "server") + print(output) + # output = [0, 1] + elif fleet.is_worker(): + input = fleet.worker_index() + output = fleet_util.all_gather(input, "worker") + # output = [0, 1] + print(output) + output = fleet_util.all_gather(input, "all") + print(output) + # output = [0, 1, 0, 1] + + if __name__ == "__main__": + train() + """ _comm_world = self.__check_comm_world(comm_world) return self.role_maker._all_gather(_comm_world, input) - def broadcast(self): + def _broadcast(self): pass - def scatter(self): + def _scatter(self): pass def get_file_shard(self, files): """ - split files before distributed training, - example 1: files is [a, b, c ,d, e] and trainer_num = 2, then trainer - 0 gets [a, b, c] and trainer 1 gets [d, e]. - example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets - [a], trainer 1 gets [b], trainer 2 gets [] + Split files before distributed training, and return the file list assigned to the current trainer. + + .. code-block:: text + + example 1: files is [a, b, c, d, e] and trainer_num = 2, then trainer + 0 gets [a, b, c] and trainer 1 gets [d, e]. + example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets + [a], trainer 1 gets [b], trainer 2 gets [] Args: - files(list): file list need to be read. + files(list): The list of files to be read. Returns: - list: files belongs to this worker. + List: The files belonging to this worker. + + Examples: + + .. code-block:: python + + from paddle.distributed.fleet.base.util_factory import fleet_util + import paddle.distributed.fleet.base.role_maker as role_maker + + role = role_maker.UserDefinedRoleMaker( + is_collective=False, + init_gloo=False, + current_id=0, + role=role_maker.Role.WORKER, + worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"], + server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"]) + fleet_util._set_role_maker(role) + files = fleet_util.get_file_shard(["file1", "file2", "file3"]) + # files = ["file1", "file2"] """ if not isinstance(files, list): raise TypeError("files should be a list of file need to be read.") @@ -140,6 +285,30 @@ return trainer_files[trainer_id] def print_on_rank(self, message, rank_id): + """ + Worker of rank `rank_id` prints some message. + + Args: + message(str): Log to be printed. + rank_id(int): Trainer id. + + Examples: + + ..
code-block:: python + + from paddle.distributed.fleet.base.util_factory import fleet_util + import paddle.distributed.fleet.base.role_maker as role_maker + + role = role_maker.UserDefinedRoleMaker( + is_collective=False, + init_gloo=False, + current_id=0, + role=role_maker.Role.WORKER, + worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"], + server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"]) + fleet_util._set_role_maker(role) + fleet_util.print_on_rank("I'm worker 0", 0) + """ if self.role_maker.worker_index() != rank_id: return print(message) @@ -297,7 +466,7 @@ class UtilBase(object): with fluid.scope_guard(scope): inference_program, feed_target_names, fetch_targets = \ fluid.io.load_inference_model(config.dump_model_dir, exe, model_filename=model_filename, - params_filename=config.save_params_filename) + params_filename=config.save_params_filename) # check program vars and saved vars shape orig_para_shape = { diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 6dba385c569be75b5b83e0a63e560ffa8ab73696..a527393f6024b27095819e256538a39a240ca7cb 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -87,7 +87,7 @@ def _parse_args(): see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2- ''') - #Optional arguments for the launch helper + # Optional arguments for the launch helper parser.add_argument( "--ips", type=str, @@ -115,7 +115,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra default="log", help="The path for each process's log.If it's not set, the log will printed to default pipe." ) - #positional + # positional parser.add_argument( "training_script", type=str, @@ -124,7 +124,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra "followed by all the arguments for the " "training script") - #rest from the training program + # rest from the training program parser.add_argument('training_script_args', nargs=REMAINDER) return parser.parse_args() @@ -138,7 +138,7 @@ def get_cluster_from_args(args, gpus): # node_ip = args.node_ip assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \ - % (node_ip, node_ips) + % (node_ip, node_ips) node_rank = node_ips.index(node_ip) logger.debug("parsed from args: node_ips:{} node_ip:{} node_rank:{}".format( @@ -280,7 +280,7 @@ def launch_ps(args): _, current_node_ip = get_host_name_ip() assert current_node_ip in node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \ - % (current_node_ip, node_ips) + % (current_node_ip, node_ips) node_rank = node_ips.index(current_node_ip) logger.debug( "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}, server_ports:{}". 
@@ -323,10 +323,12 @@ def launch_ps(args): for idx, cur_server in enumerate(pod.servers): proc_env = { "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints, + "PADDLE_TRAINER_ENDPOINTS": worker_endpoints, "PADDLE_PORT": cur_server.endpoint.split(":")[1], "TRAINING_ROLE": "PSERVER", "PADDLE_TRAINERS_NUM": str(worker_num), - "POD_IP": cur_server.endpoint.split(":")[0] + "POD_IP": cur_server.endpoint.split(":")[0], + "PADDLE_WITH_GLOO": "1" } current_env.update(proc_env) @@ -365,7 +367,8 @@ def launch_ps(args): "PADDLE_TRAINER_ENDPOINTS": worker_endpoints, "PADDLE_TRAINERS_NUM": str(worker_num), "TRAINING_ROLE": "TRAINER", - "PADDLE_TRAINER_ID": str(cur_worker.rank) + "PADDLE_TRAINER_ID": str(cur_worker.rank), + "PADDLE_WITH_GLOO": "1" } current_env.update(proc_env) @@ -430,7 +433,11 @@ def launch(): co_arg for co_arg in collective_args if co_arg in " ".join(sys.argv[1:-1]) ] - cuda_device_num = fluid.core.get_cuda_device_count() + if fluid.core.is_compiled_with_cuda(): + cuda_device_num = fluid.core.get_cuda_device_count() + else: + cuda_device_num = 0 + if len(has_ps_args) > 0 or cuda_device_num == 0: logger.info( "Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}". diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py index d98b2ef3e2a083861647b2847bafad3b08c86cfd..a3a2dee70387d69b9e8e09cd86d69a76890d7a1f 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py @@ -18,6 +18,7 @@ from .graph_execution_optimizer import GraphExecutionOptimizer from .parameter_server_optimizer import ParameterServerOptimizer from .pipeline_optimizer import PipelineOptimizer from .localsgd_optimizer import LocalSGDOptimizer +from .localsgd_optimizer import AdaptiveLocalSGDOptimizer from .lars_optimizer import LarsOptimizer from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer from .dgc_optimizer import DGCOptimizer diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py index 31a9913701c3e08f5268d578d09c15f5bf8a86f8..ad96e1426694f090943bdd08902e5e2219d32eda 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py @@ -22,9 +22,13 @@ class AMPOptimizer(MetaOptimizerBase): self.amp_opt = None # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [ - "LarsOptimizer", "LambOptimizer", "RecomputeOptimizer", - "LocalSGDOptimizer", "GradientMergeOptimizer", - "GraphExecutionOptimizer" + "LarsOptimizer", + "LambOptimizer", + "RecomputeOptimizer", + "LocalSGDOptimizer", + "GradientMergeOptimizer", + "GraphExecutionOptimizer", + "AdaptiveLocalSGDOptimizer", ] self.meta_optimizers_black_list = ["DGCOptimizer"] diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index 6fa34d8d28a907d936500907db3e4c65ab4f4da8..4ebac20888dd708bd90f91abdef4a472bac2847c 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -25,7 +25,10 @@ class LocalSGDOptimizer(MetaOptimizerBase): super(LocalSGDOptimizer, self).__init__(optimizer) self.inner_opt = optimizer self.meta_optimizers_white_list = [] - 
self.meta_optimizers_black_list = ["GraphExecutionOptimizer"] + self.meta_optimizers_black_list = [ + "GraphExecutionOptimizer", + "AdaptiveLocalSGDOptimizer", + ] self.snapshot_key = '@SNAPSHOT' def _can_apply(self): @@ -186,3 +189,252 @@ class LocalSGDOptimizer(MetaOptimizerBase): layers.cond(step > begin_step, begin_localsgd, communicate) return minimized + + +class AdaptiveLocalSGDOptimizer(MetaOptimizerBase): + def __init__(self, optimizer): + super(AdaptiveLocalSGDOptimizer, self).__init__(optimizer) + self.inner_opt = optimizer + self.meta_optimizers_white_list = [] + self.meta_optimizers_black_list = [ + "GraphExecutionOptimizer", "LocalSGDOptimizer" + ] + self.snapshot_key = '@SNAPSHOT' + + def _can_apply(self): + if not self.role_maker._is_collective: + return False + + if not self.user_defined_strategy.adaptive_localsgd: + return False + + if self.role_maker.worker_num() <= 1: + return False + + return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \ + or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \ + or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \ + or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD) + + def _disable_strategy(self, dist_strategy): + dist_strategy.adaptive_localsgd = False + dist_strategy.adaptive_localsgd_configs = {} + + def _enable_strategy(self, dist_strategy, context): + dist_strategy.adaptive_localsgd = True + dist_strategy.adaptive_localsgd_configs = { + "init_k_steps": 1, + "begin_step": 1 + } + + def snapshot_name(self, param_name): + return param_name + self.snapshot_key + + def create_snapshot_vars(self, program): + block = program.global_block() + + non_dist_params = [] + for param in block.iter_parameters(): + if not param.is_distributed: + non_dist_params.append(param) + + p2s = [] + for param in non_dist_params: + snapshot = block.create_var( + name=self.snapshot_name(param.name), + shape=param.shape, + persistable=True, + stop_gradient=True, + dtype=param.dtype) + p2s.append([param, snapshot]) + return p2s + + def init_snapshot_vars(self, startup_program, param2snapshot): + with program_guard(startup_program): + for param, snapshot in param2snapshot: + layers.assign(param, snapshot) + + def _generate_avg_loss(self, program_block, loss, avg_loss): + program_block.append_op( + type='c_allreduce_sum', + inputs={'X': [loss]}, + outputs={'Out': [avg_loss]}, + attrs={ + 'ring_id': 0, + OP_ROLE_KEY: OpRole.Optimize, + 'use_calc_stream': True + }) + program_block.append_op( + type='c_sync_calc_stream', + inputs={'X': [avg_loss]}, + outputs={'Out': [avg_loss]}, + attrs={OP_ROLE_KEY: OpRole.Optimize}) + + program_block.append_op( + type='scale', + inputs={'X': [avg_loss]}, + outputs={'Out': [avg_loss]}, + attrs={ + 'scale': 1.0 / self.role_maker.worker_num(), + OP_ROLE_KEY: OpRole.Optimize + }) + + def minimize_impl(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + minimized = self.inner_opt.minimize( + loss, startup_program=startup_program) + + init_k_steps = self.user_defined_strategy.adaptive_localsgd_configs[ + 'init_k_steps'] + begin_step_value = self.user_defined_strategy.adaptive_localsgd_configs[ + 'begin_step'] + + if startup_program is None: + startup_program = default_startup_program() + main_block = loss.block + + self.nrings = 2 + collective_helper = CollectiveHelper(self.role_maker, self.nrings) + collective_helper.update_startup_program(startup_program) + p2s = self.create_snapshot_vars(startup_program) + self.init_snapshot_vars(startup_program, p2s) + + p2s = 
self.create_snapshot_vars(main_block.program) + with program_guard(main_block.program, startup_program): + step = layers.autoincreased_step_counter(begin=1) + + k_steps = layers.create_global_var( + name="k_steps", + shape=[1], + value=int(init_k_steps), + dtype='int64', + persistable=True) + + begin_step = layers.create_global_var( + name="begin_step", + shape=[1], + value=int(begin_step_value), + dtype='int64', + persistable=True) + + last_step = layers.create_global_var( + name="last_step", + shape=[1], + value=int(0), + dtype='int64', + persistable=True) + + avg_loss = layers.create_global_var( + name="avg_loss", + shape=[1], + value=float(0), + dtype=loss.dtype, + persistable=True) + + lr_0 = layers.create_global_var( + name="lr_0", + shape=[1], + value=float(0), + dtype='float32', + persistable=True) + + loss_0 = layers.create_global_var( + name="loss_0", + shape=[1], + value=float(0), + dtype='float32', + persistable=True) + + global_lr = self.inner_opt._global_learning_rate() + + def initialize(): + self._generate_avg_loss(main_block, loss, avg_loss) + layers.assign(avg_loss, loss_0) + layers.assign(global_lr, lr_0) + + layers.cond(step == 1, initialize) + + def communicate(): + sub_block = default_main_program().current_block() + ring_id = -1 + for param, snapshot in p2s: + sub_block.append_op( + type='elementwise_sub', + inputs={'X': [snapshot], + 'Y': [param]}, + outputs={'Out': [param]}, + attrs={OP_ROLE_KEY: OpRole.Optimize}) + sub_block.append_op( + type='c_sync_calc_stream', + inputs={'X': param}, + outputs={'Out': param}, + attrs={OP_ROLE_KEY: OpRole.Optimize}) + ring_id = (ring_id + 1) % self.nrings + sub_block.append_op( + type='c_allreduce_sum', + inputs={'X': [param]}, + outputs={'Out': [param]}, + attrs={ + 'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Optimize + }) + + for ring_id in range(self.nrings): + sub_block.append_op( + type='c_sync_comm_stream', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Optimize + }) + + for param, snapshot in p2s: + sub_block.append_op( + type='scale', + inputs={'X': [param]}, + outputs={'Out': [param]}, + attrs={ + 'scale': 1.0 / self.role_maker.worker_num(), + OP_ROLE_KEY: OpRole.Optimize + }) + sub_block.append_op( + type='elementwise_sub', + inputs={'X': [snapshot], + 'Y': [param]}, + outputs={'Out': [param]}, + attrs={OP_ROLE_KEY: OpRole.Optimize}) + sub_block.append_op( + type='assign', + inputs={'X': [param]}, + outputs={'Out': [snapshot]}, + attrs={OP_ROLE_KEY: OpRole.Optimize}) + layers.assign(step, last_step) + + def communicate_avg_loss(): + communicate() + self._generate_avg_loss(main_block, loss, avg_loss) + next_local_steps = layers.cast( + layers.ceil( + layers.sqrt(lr_0 * avg_loss / (global_lr * loss_0) * + float(init_k_steps))), + dtype='int64') + max_local_steps = layers.fill_constant( + shape=[1], dtype='int64', value=16) + min_local_steps = layers.fill_constant( + shape=[1], dtype='int64', value=1) + next_local_steps = layers.elementwise_min(next_local_steps, + max_local_steps) + next_local_steps = layers.elementwise_max(next_local_steps, + min_local_steps) + layers.assign(next_local_steps, k_steps) + + def begin_localsgd(): + layers.cond(step - last_step == k_steps, communicate_avg_loss) + + layers.cond(step > begin_step, begin_localsgd, communicate) + + return minimized diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py index 
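The core of `communicate_avg_loss` in `AdaptiveLocalSGDOptimizer` above is the update of `k_steps`: the next communication interval follows `ceil(sqrt(lr_0 * avg_loss / (global_lr * loss_0) * init_k_steps))`, clipped to [1, 16]. Stripped of the static-graph plumbing, the rule is a one-liner; the following is a hedged plain-Python restatement with an illustrative function name, not fleet API code:

.. code-block:: python

    import math

    def next_local_steps(lr_0, loss_0, lr, loss, init_k_steps,
                         min_steps=1, max_steps=16):
        # Same formula as in minimize_impl: the interval tracks the ratio
        # (lr_0 * loss) / (lr * loss_0), clipped to [min_steps, max_steps].
        k = math.ceil(math.sqrt(lr_0 * loss / (lr * loss_0) * init_k_steps))
        return max(min_steps, min(max_steps, k))

    # e.g. with lr_0 = 0.1 and loss_0 = 2.0 recorded at step 1:
    print(next_local_steps(0.1, 2.0, 0.1, 2.0, 1))    # 1
    print(next_local_steps(0.1, 2.0, 0.05, 0.4, 4))   # 2
    print(next_local_steps(0.1, 2.0, 0.001, 0.5, 4))  # 10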
f1911408c84a9dde56a8674e88e0fb8ad575cae7..abf198b97e6e818e1fbe59006f98492640bcee54 100644 --- a/python/paddle/distributed/fleet/utils/__init__.py +++ b/python/paddle/distributed/fleet/utils/__init__.py @@ -11,8 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from .fs import * -from .http_server import KVHandler, KVHTTPServer, KVServer - -#__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__ diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index 2c963983b3cea3612baa3e531d96ee33a58e72ad..d821c190f62cdd3874fbfd90deec6653d27a84a1 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -32,10 +32,7 @@ import functools from pathlib import PurePosixPath, Path import shutil -__all__ = [ - 'FS', 'LocalFS', 'HDFSClient', 'ExecuteError', 'FSTimeOut', - 'FSFileExistsError', 'FSFileNotExistsError', 'FSShellCmdAborted' -] +__all__ = ['LocalFS', 'HDFSClient'] class ExecuteError(Exception): @@ -117,7 +114,37 @@ class FS(object): class LocalFS(FS): + """ + A tool for the local file system. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + subdirs, files = client.ls_dir("./") + """ + def ls_dir(self, fs_path): + """ + List directories and files under `fs_path` . + + Args: + fs_path(str): The local file path. + + Returns: + Tuple: Return a 2-tuple, the first is a list of all its subdirectories, + and the second is a list of all its subfiles, e.g. ([subdirname1, subdirname2, ...], [filename1, filename2, ...]). + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + subdirs, files = client.ls_dir("./") + """ if not self.is_exist(fs_path): return [], [] @@ -132,11 +159,46 @@ return dirs, files def mkdirs(self, fs_path): + """ + Create a local directory. + + Args: + fs_path(str): The local directory path. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.mkdirs("test_mkdirs") + client.delete("test_mkdirs") + """ assert not os.path.isfile(fs_path), "{} is already a file".format( fs_path) os.system("mkdir -p {}".format(fs_path)) def rename(self, fs_src_path, fs_dst_path): + """ + Rename the file. + + Args: + fs_src_path(str): The actual name of the file or directory. + fs_dst_path(str): The new name of the file or directory. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.touch("test_rename_src") + print(client.is_exist("test_rename_src")) # True + client.rename("test_rename_src", "test_rename_dst") + print(client.is_exist("test_rename_src")) # False + print(client.is_exist("test_rename_dst")) # True + client.delete("test_rename_dst") + """ os.rename(fs_src_path, fs_dst_path) def _rmr(self, fs_path): @@ -146,6 +208,21 @@ os.remove(fs_path) def delete(self, fs_path): + """ + Delete the local file path, whether it's a file or directory. + + Args: + fs_path(str): The local file path. + + Examples: + ..
code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.mkdirs("test_localFS_mkdirs") + client.delete("test_localFS_mkdirs") + """ if not self.is_exist(fs_path): return @@ -158,15 +235,88 @@ return False def is_file(self, fs_path): + """ + Whether the local file path is a file. + + Args: + fs_path(str): The local file path. + + Returns: + Bool: Return true if the path exists and it's a file, otherwise return false. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.touch("test_is_file") + print(client.is_file("test_is_file")) # True + client.delete("test_is_file") + """ return os.path.isfile(fs_path) def is_dir(self, fs_path): + """ + Whether the local file path is a directory. + + Args: + fs_path(str): The local file path. + + Returns: + Bool: Return true if the path exists and it's a directory, otherwise return false. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.mkdirs("test_is_dir") + print(client.is_dir("test_is_dir")) # True + client.delete("test_is_dir") + """ return os.path.isdir(fs_path) def is_exist(self, fs_path): + """ + Whether the local file path exists. + + Args: + fs_path(str): The local file path. + + Returns: + Bool: Whether it's a file or directory, return true if the path exists, + otherwise return false. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + ret = client.is_exist("test_is_exist") + """ return os.path.exists(fs_path) def touch(self, fs_path, exist_ok=True): + """ + Create a local file. + + Args: + fs_path(str): The local file path. + exist_ok(bool): When `fs_path` exists, if `exist_ok` is set false, + program will throw an Exception. Default is true. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.touch("test_touch") + client.delete("test_touch") + """ if self.is_exist(fs_path): if exist_ok: return @@ -175,6 +325,26 @@ return Path(fs_path).touch(exist_ok=True) def mv(self, src_path, dst_path, overwrite=False, test_exists=False): + """ + Move a local file or directory from `src_path` to `dst_path` . + + Args: + src_path(str): Name of the file or directory to be moved. + dst_path(str): Name of the file or directory to move to. + overwrite(bool): Whether to re-write `dst_path` if that exists. Default is False. + test_exists(bool): Check the existence of `src_path` and `dst_path` . + When `test_exists` is set true, if `src_path` doesn't exist or `dst_path` exists, program will throw an Exception. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.touch("test_mv_src") + client.mv("test_mv_src", "test_mv_dst") + client.delete("test_mv_dst") + """ if not self.is_exist(src_path): raise FSFileNotExistsError @@ -188,7 +358,21 @@ def list_dirs(self, fs_path): """ - list directory under fs_path, and only give the pure name, not include the fs_path + Only list directories under `fs_path` . + + Args: + fs_path(str): The local file path. + + Returns: + List: A list of all its subdirectories, e.g. [subdirname1, subdirname2, ...]. + + Examples: + ..
code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + subdirs = client.list_dirs("./") """ if not self.is_exist(fs_path): return [] @@ -200,26 +384,6 @@ return dirs -"""HDFS Utils.""" - - -def _handle_errors(f): - def handler(*args, **kwargs): - start = time.time() - while True: - try: - return f(*args, **kwargs) - except ExecuteError as e: - o = args[0] - time_out = float(o._time_out) / 1000.0 - inter = float(o._sleep_inter) / 1000.0 - if time.time() - start >= time_out: - raise FSTimeOut - time.sleep(inter) - - return functools.wraps(f)(handler) - - def _handle_errors(max_time_out=None): def decorator(f): @functools.wraps(f) @@ -237,7 +401,7 @@ while True: try: return f(*args, **kwargs) - #important: only ExecuteError need to retry + # important: only ExecuteError need to retry except ExecuteError as e: if time.time() - start >= time_out: raise FSTimeOut("args:{} timeout:{}".format( @@ -256,12 +420,36 @@ class HDFSClient(FS): + """ + A tool for HDFS. + + Args: + hadoop_home(str): Hadoop home. + configs(dict): Hadoop config. It is a dictionary and needs to contain the + keys: "fs.default.name" and "hadoop.job.ugi". + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + hadoop_home = "/home/client/hadoop-client/hadoop/" + + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + client.ls_dir("hdfs:/test_hdfs_client") + """ + def __init__( self, hadoop_home, configs, - time_out=5 * 60 * 1000, #ms - sleep_inter=1000): #ms + time_out=5 * 60 * 1000, # ms + sleep_inter=1000): # ms # Raise exception if JAVA_HOME not exists. java_home = os.environ["JAVA_HOME"] @@ -292,6 +480,30 @@ @_handle_errors() def list_dirs(self, fs_path): + """ + Only list directories under `fs_path` . + + Args: + fs_path(str): The HDFS file path. + + Returns: + List: A list of all its subdirectories, e.g. [subdirname1, subdirname2, ...]. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + subdirs = client.list_dirs("hdfs:/test_hdfs_client") + """ if not self.is_exist(fs_path): return [] @@ -301,7 +513,29 @@ @_handle_errors() def ls_dir(self, fs_path): """ - list directory under fs_path, and only give the pure name, not include the fs_path + List directories and files under `fs_path` . + + Args: + fs_path(str): The HDFS file path. + + Returns: + Tuple: Return a 2-tuple, the first element is the list of all its subdirectories, + and the second one is the list of all its subfiles, e.g. ([subdirname1, subdirname2, ...], [filename1, filename2, ...]). + + Examples: + + ..
code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + subdirs, files = client.ls_dir("hdfs:/test_hdfs_client") """ if not self.is_exist(fs_path): return [], [] @@ -340,6 +574,30 @@ @_handle_errors() def is_dir(self, fs_path): + """ + Whether the remote HDFS path is a directory. + + Args: + fs_path(str): The HDFS file path. + + Returns: + Bool: Return true if the path exists and it's a directory, otherwise return false. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + ret = client.is_dir("hdfs:/test_hdfs_client") + """ if not self.is_exist(fs_path): return False @@ -358,6 +616,30 @@ return True def is_file(self, fs_path): + """ + Whether the remote HDFS path is a file. + + Args: + fs_path(str): The HDFS file path. + + Returns: + Bool: Return true if the path exists and it's a file, otherwise return false. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + ret = client.is_file("hdfs:/test_hdfs_client") + """ if not self.is_exist(fs_path): return False @@ -365,6 +647,31 @@ @_handle_errors() def is_exist(self, fs_path): + """ + Whether the remote HDFS path exists. + + Args: + fs_path(str): The HDFS file path. + + Returns: + Bool: Whether it's a file or directory, return true if the path exists, + otherwise return false. + + Examples: + + ..
code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + client.download("hdfs:/test_hdfs_client", "./") + """ if not self.is_exist(fs_path): raise FSFileNotExistsError("{} not exits".format(fs_path)) @@ -420,6 +771,27 @@ class HDFSClient(FS): @_handle_errors() def mkdirs(self, fs_path): + """ + Create a remote HDFS directory. + + Args: + fs_path(str): The HDFS directory path. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + client.mkdirs("hdfs:/test_hdfs_client") + """ if self.is_exist(fs_path): return @@ -442,6 +814,30 @@ raise ExecuteError(cmd) def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True): + """ + Move a remote HDFS file or directory from `fs_src_path` to `fs_dst_path` . + + Args: + fs_src_path(str): Name of the file or directory to be moved. + fs_dst_path(str): Name of the file or directory to move to. + overwrite(bool): Whether to overwrite `fs_dst_path` if it exists. Default is False. + test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, the program will throw an Exception. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + client.mv("hdfs:/test_hdfs_client", "hdfs:/test_hdfs_client2") + """ if overwrite and self.is_exist(fs_dst_path): self.delete(fs_dst_path) @@ -484,6 +880,27 @@ class HDFSClient(FS): @_handle_errors() def delete(self, fs_path): + """ + Delete a remote HDFS path, whether it's a file or directory. + + Args: + fs_path(str): The HDFS file path. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + client.delete("hdfs:/test_hdfs_client") + """ if not self.is_exist(fs_path): return @@ -494,6 +911,27 @@ return self._rm(fs_path) def touch(self, fs_path, exist_ok=True): + """ + Create a remote HDFS file. + + Args: + fs_path(str): The HDFS file path. + exist_ok(bool): When `fs_path` exists, if `exist_ok` is set true, the + function returns directly without doing anything; otherwise an + exception is raised. Default is true. + + Examples: + + .. 
code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + client.touch("hdfs:/test_hdfs_client") + """ if self.is_exist(fs_path): if exist_ok: return diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 8d399c929018f08eb3d02e50981566705536bbf5..7b276293638189d304e5c33b2cd4497bb4256bab 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -67,6 +67,7 @@ class ImperativeQuantAware(object): Examples: .. code-block:: python + import paddle from paddle.fluid.contrib.slim.quantization \ import ImperativeQuantAware from paddle.vision.models \ @@ -86,13 +87,12 @@ class ImperativeQuantAware(object): # ... # Save quant model for the inference. - imperative_qat.save_quantized_model( - dirname="./resnet50_qat", - model=model, - input_shape=[(3, 224, 224)], - input_dtype=['float32'], - feed=[0], - fetch=[0]) + paddle.jit.save( + layer=model, + model_path="./resnet50_qat", + input_spec=[ + paddle.static.InputSpec( + shape=[None, 3, 224, 224], dtype='float32')]) """ super(ImperativeQuantAware, self).__init__() self._weight_bits = weight_bits @@ -148,75 +148,6 @@ class ImperativeQuantAware(object): quant_layer = self._get_quantized_counterpart(layer) setattr(obj, target, quant_layer) - def save_quantized_model(self, - dirname, - model, - input_shape, - input_dtype, - feed, - fetch, - append_batch_size=True): - """ - Save the quantized model for the inference. - - Args: - dirname (str): the directory to save the quantized model. - model(fluid.dygraph.Layer): the quantized model to be saved. - input_shape(list[tuple(int)]): The shape value for each input, - e.g. [(3, 224, 224)]. - input_dtype(list[str]): The dtype value for each input, - e.g. ['float32']. - feed(list[int]): the indices of the input variables of the - imperative functions which will be saved as input variables in - inference model. - fetch(list[int]): the indices of the returned variable of the - imperative functions which will be saved as output variables in - inference model. - append_batch_size(bool, optional): - If true, it prepends an extra axis to the input_shape, meanwhile, - the input_shape shouldn't contain the batch size dimension. - Otherwise, it just uses the input_shape. Default True. - Returns: - None - """ - assert isinstance( - input_shape, list), "The parameter `input_shape` shoubld be a list." - assert isinstance( - input_dtype, list), "The parameter `input_dtype` shoubld be a list." - assert isinstance(feed, list), "The parameter `feed` shoubld be a list." - assert isinstance(fetch, - list), "The parameter `fetch` shoubld be a list." - assert len(input_shape) == len( - input_dtype - ), "The length of input_shape should be equal to input_dtype's." - assert len(input_dtype) == len( - feed), "The length of input_shape should be equal to feed's." - - with dygraph.guard(): - model.eval() - input_vars = [] - for i, (shape, dtype) in enumerate(zip(input_shape, input_dtype)): - if append_batch_size: - shape = [None] + list(shape) - # Note(Aurelius84): need a elegant way to name this. 
- in_spec = paddle.static.InputSpec(shape, dtype, 'feed_%d' % i) - input_vars.append(in_spec) - # use `declarative` to convert dygraph into static program - model.forward = dygraph.jit.declarative( - model.forward, input_spec=input_vars) - outputs = model.forward.concrete_program.outputs - input_spec = [input_vars[i] for i in feed] - configs = dygraph.jit.SaveLoadConfig() - configs.separate_params = True - if not isinstance(outputs, (tuple, list)): - outputs = [outputs] - configs.output_spec = [outputs[i] for i in fetch] - dygraph.jit.save( - layer=model, - model_path=dirname, - input_spec=input_spec, - configs=configs) - def _get_quantized_counterpart(self, layer): quant_layers = tuple(self._quant_layers_map.values()) quantized_counterpart = tuple('Quantized' + k diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 79b0bbd6a4dd3850f49aa0b5124e9be86d4e6ee3..f076d274b643367a2703910dfa6899c5bfd1317c 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -221,7 +221,7 @@ class TestImperativeQat(unittest.TestCase): model_dict = lenet.state_dict() fluid.save_dygraph(model_dict, "save_temp") - # test the correctness of `save_quantized_model` + # test the correctness of `paddle.jit.save` data = next(test_reader()) test_data = np.array([x[0].reshape(1, 28, 28) for x in data]).astype('float32') @@ -231,13 +231,14 @@ class TestImperativeQat(unittest.TestCase): # save inference quantized model path = "./mnist_infer_model" - imperative_qat.save_quantized_model( - dirname=path, - model=lenet, - input_shape=[(1, 28, 28)], - input_dtype=['float32'], - feed=[0], - fetch=[0]) + paddle.jit.save( + layer=lenet, + model_path=path, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 1, 28, 28], dtype='float32') + ]) + if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) else: @@ -245,7 +246,10 @@ class TestImperativeQat(unittest.TestCase): exe = fluid.Executor(place) [inference_program, feed_target_names, fetch_targets] = ( fluid.io.load_inference_model( - dirname=path, executor=exe)) + dirname=path, + executor=exe, + model_filename="__model__", + params_filename="__variables__")) after_save, = exe.run(inference_program, feed={feed_target_names[0]: test_data}, fetch_list=fetch_targets) @@ -332,13 +336,13 @@ class TestImperativeQat(unittest.TestCase): if batch_id % 100 == 0: _logger.info('{}: {}'.format('loss', avg_loss.numpy())) - imperative_qat.save_quantized_model( - dirname="./dynamic_mnist", - model=lenet, - input_shape=[(1, 28, 28)], - input_dtype=['float32'], - feed=[0], - fetch=[0]) + paddle.jit.save( + layer=lenet, + model_path="./dynamic_mnist", + input_spec=[ + paddle.static.InputSpec( + shape=[None, 1, 28, 28], dtype='float32') + ]) # static graph train _logger.info( diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index a05aa3b0a84b57bb1f9ce00b0ad007280c316c6e..2e3bb6b00218a47467c59a2a88ea45ec80c32419 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -39,6 +39,11 @@ try: third_lib_path = current_path + os.sep + '..' 
+ os.sep + 'libs' os.environ['path'] = third_lib_path + ';' + os.environ['path'] sys.path.insert(0, third_lib_path) + # Note: from python3.8, PATH will not take effect + # https://github.com/python/cpython/pull/12302 + # Use add_dll_directory to specify dll resolution path + if sys.version_info[:2] >= (3, 8): + os.add_dll_directory(third_lib_path) except ImportError as e: from .. import compat as cpt diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 2f95c2b9007a53483fda86dda8d77e9baff0d8d2..01c2f0fed496081400d363d9464360c69d924be8 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -23,7 +23,6 @@ from paddle.fluid import framework from paddle.fluid.multiprocess_utils import CleanupFuncRegistrar from .tracer import Tracer import logging -import objgraph from ..data_feeder import convert_dtype import warnings @@ -368,24 +367,6 @@ def guard(place=None): yield -def _print_debug_msg(parameter_list, limit=5, is_test=False): - if not core._is_dygraph_debug_enabled(): - logging.warn( - 'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug' - ) - return - unique_name_size = len(framework.unique_name.generator.ids) - tracer_var_size = len(parameter_list) - alive_cpp_var_size = len(core.VarBase._alive_vars()) - if not is_test: - logging.warn( - 'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}' - .format(unique_name_size, tracer_var_size, alive_cpp_var_size)) - objgraph.show_growth(limit=limit) - else: - return unique_name_size, tracer_var_size, alive_cpp_var_size - - @framework.dygraph_only def grad(outputs, inputs, diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index 9876fc620b870f47b10e9f99e4de34f5cb81fde1..93cb0bafc847b897816636f92255bd06b7e67321 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -195,58 +195,11 @@ def load_dygraph(model_path, config=None): params_file_path = model_prefix + ".pdparams" opti_file_path = model_prefix + ".pdopt" - # deal with argument `configs` - configs = config - if configs is None: - configs = SaveLoadConfig() - - if not os.path.exists(params_file_path) and not os.path.exists( - opti_file_path): - # Load state dict by `jit.save/io.save_inference_model` save format - # NOTE(chenweihang): [ Compatibility of save_inference_model save format ] - # The model saved by `save_inference_model` does not completely correspond to - # the information required by the `state_dict` under the dygraph. - # `save_inference_model` not save structured name, we need to remind - # the user to configure the `use_structured_name` argument when `set_state_dict` - # NOTE(chenweihang): `jit.save` doesn't save optimizer state - - # 1. check model path - if not os.path.isdir(model_prefix): - raise ValueError("Model saved directory '%s' is not exists." % - model_prefix) + # deal with argument `config` + if config is None: + config = SaveLoadConfig() - # 2. load program desc & construct _ProgramHolder - programs = _construct_program_holders(model_path, - configs.model_filename) - - # 3. load layer parameters & buffers - # NOTE: using fluid.dygraph.guard() here will cause import error in py2 - with guard(): - persistable_var_dict = _construct_params_and_buffers( - model_prefix, - programs, - configs.separate_params, - configs.params_filename, - append_suffix=False) - - # 4. 
construct state_dict - para_dict = dict() - for var_name in persistable_var_dict: - para_dict[var_name] = persistable_var_dict[var_name].numpy() - - # if __variables.info__ exists, we can recover structured_name - var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME) - if os.path.exists(var_info_path): - with open(var_info_path, 'rb') as f: - extra_var_info = pickle.load(f) - structured_para_dict = dict() - for var_name in para_dict: - structured_name = extra_var_info[var_name].get( - 'structured_name', None) - assert structured_name is not None, "Cannot find saved variable (%s)'s structured name in saved model." % var_name - structured_para_dict[structured_name] = para_dict[var_name] - para_dict = structured_para_dict - else: + if os.path.exists(params_file_path) or os.path.exists(opti_file_path): # Load state dict by `save_dygraph` save format para_dict = {} if os.path.exists(params_file_path): @@ -254,12 +207,103 @@ def load_dygraph(model_path, config=None): para_dict = pickle.load(f) if six.PY2 else pickle.load( f, encoding='latin1') - if not configs.keep_name_table and "StructuredToParameterName@@" in para_dict: + if not config.keep_name_table and "StructuredToParameterName@@" in para_dict: del para_dict["StructuredToParameterName@@"] if os.path.exists(opti_file_path): with open(opti_file_path, 'rb') as f: opti_dict = pickle.load(f) if six.PY2 else pickle.load( f, encoding='latin1') + else: + # check model path + if not os.path.isdir(model_prefix): + raise ValueError("Model saved directory '%s' does not exist." % + model_prefix) + + # check whether model file exists + if config.model_filename is None: + model_filename = '__model__' + else: + model_filename = config.model_filename + model_file_path = os.path.join(model_path, model_filename) + + if os.path.exists(model_file_path): + # Load state dict by `jit.save/io.save_inference_model` save format + # NOTE(chenweihang): [ Compatibility of save_inference_model save format ] + # The model saved by `save_inference_model` does not completely correspond to + # the information required by the `state_dict` under the dygraph. + # `save_inference_model` does not save structured names, so we need to remind + # the user to configure the `use_structured_name` argument when `set_state_dict` + # NOTE(chenweihang): `jit.save` doesn't save optimizer state + + # 1. load program desc & construct _ProgramHolder + programs = _construct_program_holders(model_path, + config.model_filename) + + # 2. load layer parameters & buffers + # NOTE: using fluid.dygraph.guard() here will cause import error in py2 + with guard(): + persistable_var_dict = _construct_params_and_buffers( + model_prefix, + programs, + config.separate_params, + config.params_filename, + append_suffix=False) + + # 3. construct state_dict + para_dict = dict() + for var_name in persistable_var_dict: + para_dict[var_name] = persistable_var_dict[var_name].numpy() + + # if __variables.info__ exists, we can recover structured_name + var_info_path = os.path.join(model_prefix, + EXTRA_VAR_INFO_FILENAME) + if os.path.exists(var_info_path): + with open(var_info_path, 'rb') as f: + extra_var_info = pickle.load(f) + structured_para_dict = dict() + for var_name in para_dict: + structured_name = extra_var_info[var_name].get( + 'structured_name', None) + assert structured_name is not None, "Cannot find saved variable (%s)'s structured name in saved model." 
% var_name + structured_para_dict[structured_name] = para_dict[ + var_name] + para_dict = structured_para_dict + else: + # load state dict by `io.save_params/persistables` save format + # TODO(chenweihang): [ Now only supports loading parameters separately ] + # If users save all parameters as one file, the [ variable.name -> variable ] + # mapping info will be lost, so users need to give the variable list, but building the + # variable list in dygraph mode is difficult; we recommend users to use + # paddle.io.load_program_state in this case + + # Try to load all the files in the directory in VarBase format, + # the file name is used as the name of VarBase + load_var_list = [] + + # 1. load file names + var_name_list = [] + for root, _, files in os.walk(model_path): + for filename in files: + file_path = os.path.join(root, filename) + tmp_var_name = os.path.relpath(file_path, model_path) + var_name = tmp_var_name.replace("\\", "/") + var_name_list.append(var_name) + + # 2. create and load VarBase + with guard(): + for name in var_name_list: + new_var = _varbase_creator(name=name, persistable=True) + _dygraph_tracer().trace_op( + type='load', + inputs={}, + outputs={'Out': new_var}, + attrs={'file_path': os.path.join(model_path, name)}) + load_var_list.append(new_var) + + # 3. construct state_dict + para_dict = dict() + for var in load_var_list: + para_dict[var.name] = var.numpy() return para_dict, opti_dict diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index 5152799ca72f1461d6fbfc3a619a6aa9b9477934..5050067e48a1b147d43abd955c64c7fbb8cf6068 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -60,7 +60,7 @@ class DygraphToStaticAst(gast.NodeTransformer): def transfer_from_node_type(self, node_wrapper): translator_logger = logging_utils.TranslatorLogger() translator_logger.log( - 1, " Source code: \n{}".format(ast_to_source_code(self.root))) + 1, "Source code: \n{}".format(ast_to_source_code(self.root))) # Generic transformation self.visit(node_wrapper.node) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py index 37ce8b0a152ff8e258e8aee2a54ed7215f77c146..3d1ed836ff1aca38d796f3a247e9a5d6f6cf3add 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py @@ -12,17 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
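# An illustrative, runnable sketch of the three-way format dispatch that the
# reworked `load_dygraph` above implements; the function name and the return
# strings here are hypothetical, only the existence checks mirror the diff.
import os


def _sketch_load_dygraph_dispatch(model_prefix, model_path, model_filename=None):
    # 1. `save_dygraph` format: pickled ".pdparams"/".pdopt" files
    if os.path.exists(model_prefix + ".pdparams") or os.path.exists(
            model_prefix + ".pdopt"):
        return "save_dygraph format (pickled state dicts)"
    # 2. `jit.save`/`save_inference_model` format: a program file
    #    (default name "__model__") inside the model directory
    if os.path.exists(os.path.join(model_path, model_filename or "__model__")):
        return "jit.save / save_inference_model format (program + variables)"
    # 3. fall back: every file under the directory is loaded as one variable
    return "io.save_params / save_persistables format (one file per variable)"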
-import logging import six import inspect import numpy as np import collections + import paddle from paddle.fluid import core from paddle.fluid.dygraph import layers from paddle.fluid.layers.utils import flatten from paddle.fluid.layers.utils import pack_sequence_as from paddle.fluid.dygraph.base import switch_to_static_graph +from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs from paddle.fluid.dygraph.dygraph_to_static.utils import type_name from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code @@ -291,7 +292,7 @@ def convert_to_input_spec(inputs, input_spec): if len(inputs) > len(input_spec): for rest_input in inputs[len(input_spec):]: if isinstance(rest_input, (core.VarBase, np.ndarray)): - logging.warning( + logging_utils.warn( "The inputs constain `{}` without specificing InputSpec, its shape and dtype will be treated immutable. " "Please specific InputSpec information in `@declarative` if you expect them as mutable inputs.". format(type_name(rest_input))) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py index c52872b15016169504359b54ad5a40360e244ce0..4d9ed5916adfd79013be1d8d1bb90f3c44428b49 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py @@ -26,6 +26,8 @@ CODE_LEVEL_ENV_NAME = 'TRANSLATOR_CODE_LEVEL' DEFAULT_VERBOSITY = -1 DEFAULT_CODE_LEVEL = -1 +LOG_AllTransformer = 100 + def synchronized(func): def wrapper(*args, **kwargs): @@ -53,10 +55,15 @@ class TranslatorLogger(object): return self._initialized = True + self.logger_name = "Dynamic-to-Static" self._logger = log_helper.get_logger( - __name__, 1, fmt='%(asctime)s-%(levelname)s: %(message)s') + self.logger_name, + 1, + fmt='%(asctime)s %(name)s %(levelname)s: %(message)s') self._verbosity_level = None self._transformed_code_level = None + self._need_to_echo_log_to_stdout = None + self._need_to_echo_code_to_stdout = None @property def logger(self): @@ -86,6 +93,28 @@ class TranslatorLogger(object): self.check_level(level) self._transformed_code_level = level + @property + def need_to_echo_log_to_stdout(self): + if self._need_to_echo_log_to_stdout is not None: + return self._need_to_echo_log_to_stdout + return False + + @need_to_echo_log_to_stdout.setter + def need_to_echo_log_to_stdout(self, log_to_stdout): + assert isinstance(log_to_stdout, (bool, type(None))) + self._need_to_echo_log_to_stdout = log_to_stdout + + @property + def need_to_echo_code_to_stdout(self): + if self._need_to_echo_code_to_stdout is not None: + return self._need_to_echo_code_to_stdout + return False + + @need_to_echo_code_to_stdout.setter + def need_to_echo_code_to_stdout(self, code_to_stdout): + assert isinstance(code_to_stdout, (bool, type(None))) + self._need_to_echo_code_to_stdout = code_to_stdout + def check_level(self, level): if isinstance(level, (six.integer_types, type(None))): rv = level @@ -110,34 +139,56 @@ class TranslatorLogger(object): def error(self, msg, *args, **kwargs): self.logger.error(msg, *args, **kwargs) + if self.need_to_echo_log_to_stdout: + self._output_to_stdout('ERROR: ' + msg, *args) def warn(self, msg, *args, **kwargs): - self.logger.warn(msg, *args, **kwargs) + self.logger.warning(msg, *args, **kwargs) + if self.need_to_echo_log_to_stdout: + self._output_to_stdout('WARNING: ' + msg, *args) def log(self, level, msg, *args, 
**kwargs): if self.has_verbosity(level): - self.logger.log(level, msg, *args, **kwargs) + msg_with_level = '(Level {}) {}'.format(level, msg) + self.logger.info(msg_with_level, *args, **kwargs) + if self.need_to_echo_log_to_stdout: + self._output_to_stdout('INFO: ' + msg_with_level, *args) def log_transformed_code(self, level, ast_node, transformer_name, *args, **kwargs): if self.has_code_level(level): source_code = ast_to_source_code(ast_node) - header_msg = "After the level {} ast transformer: '{}', the transformed code:\n"\ - .format(level, transformer_name) + if level == LOG_AllTransformer: + header_msg = "After the last level ast transformer: '{}', the transformed code:\n" \ + .format(transformer_name) + else: + header_msg = "After the level {} ast transformer: '{}', the transformed code:\n"\ + .format(level, transformer_name) msg = header_msg + source_code self.logger.info(msg, *args, **kwargs) + if self.need_to_echo_code_to_stdout: + self._output_to_stdout('INFO: ' + msg, *args) + + def _output_to_stdout(self, msg, *args): + msg = self.logger_name + ' ' + msg + print(msg % args) + _TRANSLATOR_LOGGER = TranslatorLogger() -def set_verbosity(level=0): +def set_verbosity(level=0, also_to_stdout=False): """ - Sets the verbosity level of log for dygraph to static graph. + Sets the verbosity level of log for dygraph to static graph. Logs can be output to stdout by setting `also_to_stdout`. + There are two means to set the logging verbosity: - 1. Call function `set_verbosity` - 2. Set environment variable `TRANSLATOR_VERBOSITY` + + 1. Call function `set_verbosity` + + 2. Set environment variable `TRANSLATOR_VERBOSITY` + **Note**: `set_verbosity` has a higher priority than the environment variable. @@ -145,6 +196,7 @@ Args: level(int): The verbosity level. The larger value idicates more verbosity. The default value is 0, which means no logging. + also_to_stdout(bool): Whether to also output log messages to `sys.stdout`. Examples: .. code-block:: python @@ -159,27 +211,30 @@ # The verbosity level is now 3, but it has no effect because it has a lower priority than `set_verbosity` """ _TRANSLATOR_LOGGER.verbosity_level = level + _TRANSLATOR_LOGGER.need_to_echo_log_to_stdout = also_to_stdout def get_verbosity(): return _TRANSLATOR_LOGGER.verbosity_level -LOG_AllTransformer = 100 - - -def set_code_level(level=LOG_AllTransformer): +def set_code_level(level=LOG_AllTransformer, also_to_stdout=False): """ - Sets the level to print code from specific level of Ast Transformer. + Sets the level to print code from a specific level of AST transformer. Code can be output to stdout by setting `also_to_stdout`. + There are two means to set the code level: - 1. Call function `set_code_level` - 2. Set environment variable `TRANSLATOR_CODE_LEVEL` + + 1. Call function `set_code_level` + + 2. Set environment variable `TRANSLATOR_CODE_LEVEL` + **Note**: `set_code_level` has a higher priority than the environment variable. Args: level(int): The level to print code. Default is 100, which means to print the code after all AST Transformers. + also_to_stdout(bool): Whether to also output code to `sys.stdout`. Examples: .. 
code-block:: python @@ -195,6 +250,7 @@ def set_code_level(level=LOG_AllTransformer): """ _TRANSLATOR_LOGGER.transformed_code_level = level + _TRANSLATOR_LOGGER.need_to_echo_code_to_stdout = also_to_stdout def get_code_level(): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index 59cb5fb144eb50f4616c94ed78348d56a4029834..1004665ca15fbc2458c1626735f161c7f4904596 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -14,21 +14,17 @@ from __future__ import print_function import numpy as np -import logging import six -from paddle.fluid import log_helper from paddle.fluid import framework, backward, core from paddle.fluid.dygraph import layers from paddle.fluid.dygraph.base import switch_to_static_graph +from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM from paddle.fluid.layers.utils import flatten from paddle.fluid.layers.utils import pack_sequence_as import paddle.compat as cpt -_logger = log_helper.get_logger( - __name__, logging.WARNING, fmt='%(asctime)s-%(levelname)s: %(message)s') - class NestSequence(object): """ @@ -72,7 +68,7 @@ class NestSequence(object): if not isinstance(var, (framework.Variable, core.VarBase)): warning_types.add(type(var)) if warning_types: - _logger.warning( + logging_utils.warn( "Output of traced function contains non-tensor type values: {}. " "Currently, We don't support to update them while training and will return " "what we first saw. Please try to return them as tensor.". diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py index d555c8ed28f358a43e53966dd30d76d85a03dde5..efde2481721f4f16f0ecadb1200b4abcde73e2b8 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py @@ -15,14 +15,8 @@ from __future__ import print_function import gast -import logging -from paddle.fluid import log_helper -from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, NodeVarType, StaticAnalysisVisitor -from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code - -_logger = log_helper.get_logger( - __name__, logging.WARNING, fmt='%(asctime)s-%(levelname)s: %(message)s') +from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, StaticAnalysisVisitor class PrintTransformer(gast.NodeTransformer): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index e5fce3e6ede1511458f8da916165738d9e842d1a..5218c0aac957422a665513b5eb2a0391c5c7a01f 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -13,17 +13,15 @@ # limitations under the License. 
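# An illustrative usage sketch for the new `also_to_stdout` switches added
# above; both helpers are re-exported as `paddle.jit.set_verbosity` and
# `paddle.jit.set_code_level`, as the updated unit tests below exercise.
import paddle

paddle.jit.set_verbosity(3, also_to_stdout=True)  # echo translator logs to sys.stdout
paddle.jit.set_code_level(100, also_to_stdout=True)  # also echo the transformed code
# The TRANSLATOR_VERBOSITY / TRANSLATOR_CODE_LEVEL environment variables still
# work, but these API calls take priority over them.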
from __future__ import print_function -import gast + import collections -import logging +import gast import inspect import six import textwrap import threading -import warnings import weakref -import gast from paddle.fluid import framework from paddle.fluid import in_dygraph_mode from paddle.fluid.dygraph import layers @@ -246,7 +244,7 @@ class StaticLayer(object): self._function_spec = FunctionSpec(function, input_spec) self._program_cache = ProgramCache() self._descriptor_cache = weakref.WeakKeyDictionary() - # Note: Hold a reference to ProgramTranslator for switching `enable_declarative`. + # Note: Hold a reference to ProgramTranslator for switching `enable_to_static`. self._program_trans = ProgramTranslator() def __get__(self, instance, owner): @@ -299,16 +297,17 @@ class StaticLayer(object): """ # 1. call dygraph function directly if not enable `declarative` - if not self._program_trans.enable_declarative: + if not self._program_trans.enable_to_static: logging_utils.warn( - "The decorator '@paddle.jit.to_static' does NOT work when setting ProgramTranslator.enable=False. " - "We will just return dygraph output.") + "The decorator '@paddle.jit.to_static' does NOT work when setting ProgramTranslator.enable to False. " + "We will just return dygraph output. If you would like to get static graph output, please call API " + "ProgramTranslator.enable(True)") return self._call_dygraph_function(*args, **kwargs) - if not in_dygraph_mode() and self._program_trans.enable_declarative: + if not in_dygraph_mode(): raise RuntimeError( "Failed to run the callable object {} decorated by '@paddle.jit.to_static', " - "because it does NOT in dynamic mode. Please disable the static mode to enter dynamic mode with the " + "because it is NOT in dynamic mode. Please disable the static mode to enter dynamic mode with the " "following API: paddle.disable_static().".format( self.dygraph_function)) @@ -450,7 +449,7 @@ class StaticLayer(object): format(self._function_spec)) # If more than one programs have been cached, return the recent converted program by default. elif cached_program_len > 1: - logging.warning( + logging_utils.warn( "Current {} has more than one cached programs: {}, the last traced progam will be return by default.". format(self._function_spec, cached_program_len)) @@ -631,7 +630,7 @@ class ProgramCache(object): # Note: raise warnings if number of traced program is more than `max_tracing_count` current_tracing_count = len(self._caches) if current_tracing_count > MAX_TRACED_PROGRAM_COUNT: - logging.warning( + logging_utils.warn( "Current traced program number: {} > `max_tracing_count`:{}. Too much cached programs will bring expensive overhead. " "The reason may be: (1) passing tensors with different shapes, (2) passing python objects instead of tensors.". format(current_tracing_count, MAX_TRACED_PROGRAM_COUNT)) @@ -723,15 +722,15 @@ class ProgramTranslator(object): return self._initialized = True self._program_cache = ProgramCache() - self.enable_declarative = True + self.enable_to_static = True - def enable(self, enable_declarative): + def enable(self, enable_to_static): """ Enable or disable the converting from imperative to declarative by ProgramTranslator globally. Args: - enable_declarative (bool): True or False to enable or disable declarative. + enable_to_static (bool): True or False to enable or disable declarative. Returns: None. @@ -760,9 +759,9 @@ class ProgramTranslator(object): print(func(x).numpy()) # [[2. 
2.]] """ - check_type(enable_declarative, "enable_declarative", bool, + check_type(enable_to_static, "enable_to_static", bool, "ProgramTranslator.enable") - self.enable_declarative = enable_declarative + self.enable_to_static = enable_to_static def get_output(self, dygraph_func, *args, **kwargs): """ @@ -803,10 +802,13 @@ class ProgramTranslator(object): assert callable( dygraph_func ), "Input dygraph_func is not a callable in ProgramTranslator.get_output" - if not self.enable_declarative: - warnings.warn( - "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable = False. " - "We will just return dygraph output.") + + if not self.enable_to_static: + logging_utils.warn( + "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable to False. " + "We will just return dygraph output. " + "Please call ProgramTranslator.enable(True) if you would like to get static output." + ) return dygraph_func(*args, **kwargs) try: function_spec = FunctionSpec(dygraph_func) @@ -876,10 +878,12 @@ class ProgramTranslator(object): assert callable( dygraph_func ), "Input dygraph_func is not a callable in ProgramTranslator.get_func" - if not self.enable_declarative: - warnings.warn( - "The ProgramTranslator.get_func doesn't work when setting ProgramTranslator.enable=False. We will " - "just return dygraph output.") + + if not self.enable_to_static: + logging_utils.warn( + "The ProgramTranslator.get_func doesn't work when setting ProgramTranslator.enable to False. We will " + "just return dygraph output. Please call ProgramTranslator.enable(True) if you would like to get static output." + ) return dygraph_func static_func = convert_to_static(dygraph_func) @@ -929,10 +933,13 @@ class ProgramTranslator(object): assert callable( dygraph_func ), "Input dygraph_func is not a callable in ProgramTranslator.get_program" - if not self.enable_declarative: - warnings.warn( - "The ProgramTranslator.get_program doesn't work when setting ProgramTranslator.enable=False." - "We will just return dygraph output.") + + if not self.enable_to_static: + logging_utils.warn( + "The ProgramTranslator.get_program doesn't work when setting ProgramTranslator.enable to False." + "We will just return dygraph output. " + "Please call ProgramTranslator.enable(True) if you would like to get static output." 
+ ) return dygraph_func(*args, **kwargs) function_spec = FunctionSpec(dygraph_func) diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 57864efec8a9447cca0be94f0f1b433c18435376..10819e4b320dd0630c7ac43fdf89b84252823a94 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -26,6 +26,7 @@ from paddle.fluid import core from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy from paddle.fluid.data_feeder import check_type from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph +from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticLayer, unwrap_decorators from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer @@ -119,8 +120,8 @@ def _dygraph_to_static_func_(dygraph_func): # TODO: remove this decorator after we finalize training API def __impl__(*args, **kwargs): program_translator = ProgramTranslator() - if in_dygraph_mode() or not program_translator.enable_declarative: - warnings.warn( + if in_dygraph_mode() or not program_translator.enable_to_static: + logging_utils.warn( "The decorator 'dygraph_to_static_func' doesn't work in " "dygraph mode or set ProgramTranslator.enable to False. " "We will just return dygraph output.") @@ -215,7 +216,7 @@ def declarative(function=None, input_spec=None): if isinstance(function, Layer): if isinstance(function.forward, StaticLayer): class_name = function.__class__.__name__ - warnings.warn( + logging_utils.warn( "`{}.forward` has already been decorated somewhere. It will be redecorated to replace previous one.". format(class_name)) function.forward = decorated(function.forward) @@ -832,9 +833,9 @@ def save(layer, model_path, input_spec=None, config=None): # 1. input check prog_translator = ProgramTranslator() - if not prog_translator.enable: + if not prog_translator.enable_to_static: raise RuntimeError( - "The paddle.jit.save doesn't work when setting ProgramTranslator.enable=False." + "The paddle.jit.save doesn't work when setting ProgramTranslator.enable to False." 
) if not isinstance(layer, Layer): raise TypeError( diff --git a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py index ad51a043a0a50f89f77811adb7f95759a4f220be..a8c1656b2b03758043aec9058db317e52b45a5af 100644 --- a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py +++ b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py @@ -98,7 +98,7 @@ class AutoCheckpointChecker(object): self._fs_cache = os.getenv("PADDLE_EDL_FS_CACHE", ".cache") self._save_checkpoint_inter = int( - os.getenv("PADDLE_EDL_SAVE_CHECKPOINT_INTER", "900")) #s + os.getenv("PADDLE_EDL_SAVE_CHECKPOINT_INTER", "900")) # s if not self._ce_test: assert len(self._hdfs_home) > 3 and \ @@ -132,7 +132,7 @@ class AutoCheckpointChecker(object): if in_dygraph_mode(): return False - return self._run_env is not None and \ + return self._run_env is not None and \ self._platform is not None and \ self._job_id is not None and \ self._hdfs_home is not None and \ diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index cb1a54ef19899059d1a46d0807ce58bf3b5ab8b5..58313c46c3cf0d42d6e14e10d0ca91f361ce787a 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py +++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -26,8 +26,7 @@ import paddle.fluid as fluid from paddle.fluid.log_helper import get_logger from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet as fleet_pslib from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet as fleet_transpiler -from . import hdfs -from .hdfs import * +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient from . import utils __all__ = ["FleetUtil"] diff --git a/python/paddle/fluid/incubate/fleet/utils/fs.py b/python/paddle/fluid/incubate/fleet/utils/fs.py deleted file mode 100644 index 0ba06ef934a525d3801e233c6e2f124fb0a6df52..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/incubate/fleet/utils/fs.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
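# An illustrative migration sketch for the incubate fs module deleted below;
# the directory name here is hypothetical, the API calls match the LocalFS
# methods shown in this patch.
from paddle.distributed.fleet.utils.fs import LocalFS  # new public location

client = LocalFS()
client.mkdirs("./ckpt_tmp")  # behaves like `mkdir -p`
assert client.is_dir("./ckpt_tmp")
client.delete("./ckpt_tmp")  # removes a file or a whole directory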
-import os -import sys -import subprocess -import multiprocessing -from datetime import datetime - -import re -import copy -import errno -import time -import logging -import abc -from pathlib import PurePosixPath, Path -import shutil - -__all__ = ['FS', 'LocalFS'] - - -class ExecuteError(Exception): - pass - - -class FSFileExistsError(Exception): - pass - - -class FSFileNotExistsError(Exception): - pass - - -class FSTimeOut(Exception): - pass - - -class FSShellCmdAborted(ExecuteError): - pass - - -class FS(object): - @abc.abstractmethod - def ls_dir(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def is_file(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def is_dir(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def is_exist(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def upload(self, local_path, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def download(self, fs_path, local_path): - raise NotImplementedError - - @abc.abstractmethod - def mkdirs(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def delete(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def need_upload_download(self): - raise NotImplementedError - - @abc.abstractmethod - def rename(self, fs_src_path, fs_dst_path): - raise NotImplementedError - - @abc.abstractmethod - def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=False): - raise NotImplementedError - - @abc.abstractmethod - def upload_dir(self, local_dir, dest_dir): - raise NotImplementedError - - @abc.abstractmethod - def list_dirs(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def touch(self, fs_path, exist_ok=True): - raise NotImplementedError - - -class LocalFS(FS): - def ls_dir(self, fs_path): - return [f for f in os.listdir(fs_path)] - - def mkdirs(self, fs_path): - assert not os.path.isfile(fs_path), "{} is already a file".format( - fs_path) - os.system("mkdir -p {}".format(fs_path)) - - def rename(self, fs_src_path, fs_dst_path): - os.rename(fs_src_path, fs_dst_path) - - def _rmr(self, fs_path): - shutil.rmtree(fs_path) - - def _rm(self, fs_path): - os.remove(fs_path) - - def delete(self, fs_path): - if not self.is_exist(fs_path): - return - - if os.path.isfile(fs_path): - return self._rm(fs_path) - - return self._rmr(fs_path) - - def need_upload_download(self): - return False - - def is_file(self, fs_path): - return os.path.isfile(fs_path) - - def is_dir(self, fs_path): - return os.path.isdir(fs_path) - - def is_exist(self, fs_path): - return os.path.exists(fs_path) - - def touch(self, fs_path, exist_ok=True): - if self.is_exist(fs_path): - if exist_ok: - return - raise FSFileExistsError - - return Path(fs_path).touch(exist_ok=True) - - def mv(self, src_path, dst_path, overwrite=False, test_exists=False): - if not self.is_exist(src_path): - raise FSFileNotExistsError - - if overwrite and self.is_exist(dst_path): - self.delete(dst_path) - - if self.is_exist(dst_path): - raise FSFileExistsError - - return self.rename(src_path, dst_path) - - def list_dirs(self, fs_path): - """ - list directory under fs_path, and only give the pure name, not include the fs_path - """ - if not self.is_exist(fs_path): - return [] - - dirs = [ - f for f in os.listdir(fs_path) if os.path.isdir(fs_path + "/" + f) - ] - - return dirs diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 
bc9f182d95e3b728fbc0866e1c79f5508d3a04aa..4a750f301a02c1a8f90e4103103c174baf32ead9 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -11229,7 +11229,7 @@ def shape(input): input.shape = [3, 2] Args: - input (Variable): The input can be N-D Tensor or SelectedRows with data type float16, float32, float64, int32, int64. + input (Variable): The input can be N-D Tensor or SelectedRows with data type bool, float16, float32, float64, int32, int64. If input variable is type of SelectedRows, returns the shape of it's inner tensor. Returns: @@ -11253,8 +11253,8 @@ def shape(input): print(res) # [array([ 3, 100, 100], dtype=int32)] """ check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'shape') + input, 'input', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'shape') helper = LayerHelper('shape', **locals()) out = helper.create_variable_for_type_inference(dtype='int32') helper.append_op( diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 192effd2e42dc937fbf47efdd1d772a4c078f888..1e7915ed781a6441f32fb86c3c92e6f68ca66b93 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -3570,8 +3570,10 @@ class ExponentialMovingAverage(object): # bias correction with layers.control_flow.Switch() as switch: with switch.case(global_step > 0): - layers.assign(output=ema, input=ema / (1.0 - decay_pow)) - layers.assign(input=ema, output=param) + layers.assign( + output=param, input=ema / (1.0 - decay_pow)) + with switch.default(): + layers.assign(output=param, input=ema) self.restore_program = Program() block = self.restore_program.global_block() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index fa092ffb191601192fa5fed050b1e1f995896058..8d236dca22f2266771a029b7cdbf7db21aefb1fe 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -4,6 +4,7 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FL set(dist_ENVS http_proxy="" https_proxy="") file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py") +list(REMOVE_ITEM DIST_TEST_OPS "test_dist_op") if(NOT WITH_NCCL) list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl") endif() @@ -102,7 +103,6 @@ if(WIN32) endif() -LIST(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_new) LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint) LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint1) LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint2) @@ -326,7 +326,6 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_api) list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op) list(REMOVE_ITEM TEST_OPS test_basic_lstm_api) list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op) -list(REMOVE_ITEM TEST_OPS test_imperative_debug_string) list(REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass) list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_mnist) list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_while) @@ -416,7 +415,6 @@ py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_ py_test_modules(test_install_check MODULES test_install_check ENVS FLAGS_cudnn_deterministic=1 SERIAL) set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST") -py_test_modules(test_imperative_debug_string MODULES test_imperative_debug_string ENVS FLAGS_dygraph_debug=1) py_test_modules(test_imperative_static_runner_mnist MODULES 
test_imperative_static_runner_mnist ENVS FLAGS_cudnn_deterministic=1) py_test_modules(test_imperative_static_runner_while MODULES test_imperative_static_runner_while ENVS @@ -465,8 +463,8 @@ if(WITH_DISTRIBUTE) #py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS}) if(NOT WIN32) py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS}) - #py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS}) - #py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS}) + py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS}) + py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS}) endif(NOT WIN32) endif(NOT APPLE) if(WITH_DGC) @@ -560,7 +558,7 @@ endif() set_tests_properties(test_parallel_executor_test_while_train test_parallel_executor_mnist test_parallel_executor_feed_persistable_var test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass - test_data_norm_op test_imperative_using_non_zero_gpu + test_data_norm_op test_dataloader_keep_order test_dataloader_unkeep_order test_parallel_executor_fetch_isolated_var diff --git a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py index 529ff4ec45d1fdc6d1d8e765e38cff53d36aade7..2464882d617effb838c1f40d40ec2d89c13e73d2 100644 --- a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py +++ b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py @@ -20,8 +20,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet import os import sys -from paddle.fluid.incubate.fleet.utils.fs import LocalFS -from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel from paddle.fluid.framework import program_guard diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py index 510b615654751500c33dc3311353ba7e2f8baf40..b8a18179742df108d44dbf527adba819eefe91cd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py @@ -56,8 +56,30 @@ class TestLoggingUtils(unittest.TestCase): with self.assertRaises(TypeError): paddle.jit.set_verbosity(3.3) - def test_code_level(self): + def test_also_to_stdout(self): + logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout = None + self.assertEqual( + logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout, False) + paddle.jit.set_verbosity(also_to_stdout=False) + self.assertEqual( + logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout, False) + + logging_utils._TRANSLATOR_LOGGER.need_to_echo_code_to_stdout = None + self.assertEqual( + logging_utils._TRANSLATOR_LOGGER.need_to_echo_code_to_stdout, False) + + paddle.jit.set_code_level(also_to_stdout=True) + self.assertEqual( + logging_utils._TRANSLATOR_LOGGER.need_to_echo_code_to_stdout, True) + + with self.assertRaises(AssertionError): + paddle.jit.set_verbosity(also_to_stdout=1) + + with self.assertRaises(AssertionError): + paddle.jit.set_code_level(also_to_stdout=1) + + 
def test_set_code_level(self): paddle.jit.set_code_level(None) os.environ[logging_utils.CODE_LEVEL_ENV_NAME] = '2' self.assertEqual(logging_utils.get_code_level(), 2) @@ -71,7 +93,25 @@ class TestLoggingUtils(unittest.TestCase): with self.assertRaises(TypeError): paddle.jit.set_code_level(3.3) - def test_log(self): + def test_log_api(self): + # test api for CI Converage + logging_utils.set_verbosity(1, True) + + logging_utils.warn("warn") + logging_utils.error("error") + + logging_utils.log(1, "log level 1") + logging_utils.log(2, "log level 2") + + source_code = "x = 3" + ast_code = gast.parse(source_code) + logging_utils.set_code_level(1, True) + logging_utils.log_transformed_code(1, ast_code, "TestTransformer") + logging_utils.set_code_level(logging_utils.LOG_AllTransformer, True) + logging_utils.log_transformed_code(logging_utils.LOG_AllTransformer, + ast_code, "TestTransformer") + + def test_log_message(self): stream = io.BytesIO() if six.PY2 else io.StringIO() log = self.translator_logger.logger stdout_handler = logging.StreamHandler(stream) @@ -84,13 +124,14 @@ class TestLoggingUtils(unittest.TestCase): if six.PY3: with mock.patch.object(sys, 'stdout', stream): + logging_utils.set_verbosity(1, False) logging_utils.warn(warn_msg) logging_utils.error(error_msg) - self.translator_logger.verbosity_level = 1 logging_utils.log(1, log_msg_1) logging_utils.log(2, log_msg_2) - result_msg = '\n'.join([warn_msg, error_msg, log_msg_1, ""]) + result_msg = '\n'.join( + [warn_msg, error_msg, "(Level 1) " + log_msg_1, ""]) self.assertEqual(result_msg, stream.getvalue()) def test_log_transformed_code(self): diff --git a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py index 6a752bc3053d7d0672bd0002250252c3bbbfa1e1..766dcc39af1a55614b0c179a686dbaab5af1e349 100644 --- a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py +++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py @@ -19,7 +19,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet import os import sys -from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError java_home = os.environ["JAVA_HOME"] diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..5eb397b5a95b240dcaff9dee3758646b35ab5022 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py @@ -0,0 +1,171 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
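# An illustrative sketch (shapes are arbitrary) of the dilated-convolution
# case covered by the new tests below: with dilation != 1 the conv+bias fuse
# pass only prints a warning and does not fuse, so the unfused output must
# still match the reference result.
import paddle.fluid as fluid

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    data = fluid.data(name="data", shape=[-1, 3, 32, 32], dtype="float32")
    # dilation=2 -> the fuse is skipped for this conv
    conv_out = fluid.layers.conv2d(
        input=data, num_filters=3, filter_size=3, dilation=2, bias_attr=True)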
+ +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig +"""Test for fusion of conv and bias.""" + + +# padding SAME +class ConvBiasMkldnnFusePassTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + padding="SAME", + bias_attr=param_attr) + + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + +# padding VALID +class ConvBiasMkldnnFusePassTest1(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + padding="VALID", + bias_attr=param_attr) + + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + +# padding number +class ConvBiasMkldnnFusePassTest2(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + padding=[2, 4, 6, 8], + bias_attr=param_attr) + + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + +# dilation is not supported yet: the pass only prints a warning log and does not fuse +class ConvBiasMkldnnFusePassTest3(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + padding="VALID", + dilation=2, + groups=3, + bias_attr=param_attr, + use_cudnn=False, + act="softmax", + data_format="NCHW") + + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + +# all conv params except for dilation +class ConvBiasMkldnnFusePassTest4(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + param_attr = 
+class ConvBiasMkldnnFusePassTest4(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + padding="VALID", + groups=3, + bias_attr=param_attr, + use_cudnn=False, + act="softmax", + data_format="NCHW") + + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py index 90db9595d92ef602c03fa7dd104484a4f6101a87..3c78438bdf68538da598f19270d8812e1286474d 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py @@ -67,13 +67,13 @@ class AutoCheckpointTestDist(AutoCheckPointACLBase): save_dir = "./run_save_0" fs.delete(save_dir) - #basic + # basic exe, main_prog, startup_prog = self._generate() compiled, data_loader, optimizer, loss, image, label = \ self._init_env(exe, main_prog, startup_prog, minimize=False) - #fleet + # fleet os.environ["TRAINING_ROLE"] = "TRAINER" os.environ["PADDLE_TRAINER_ID"] = "0" os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070" diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py b/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py index ad75f2aa8bc06d2af24f5d22eb126a3558cd6f74..4c1b1e0f0bf9034263d2c36747cdae301da24215 100644 --- a/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py +++ b/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py @@ -21,8 +21,7 @@ from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver import os import sys -from paddle.fluid.incubate.fleet.utils.fs import LocalFS -from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index 4982cd195820811b9a8ec3fe6d01955234032120..c6190590108876ba97feb4dad0c31884727ec978 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -26,7 +26,7 @@ def stable_softmax(x): return exps / np.sum(exps) -def log_softmax(x, axis=-1): +def log_softmax(x, axis=1): softmax_out = np.apply_along_axis(stable_softmax, axis, x) return np.log(softmax_out) diff --git a/python/paddle/fluid/tests/unittests/test_device_guard.py b/python/paddle/fluid/tests/unittests/test_device_guard.py index eb8861f1bc462e61d4bd685fca39452bfd8a8c4e..330065ecd92f1537e10df7b993682da821a46548 100644 --- a/python/paddle/fluid/tests/unittests/test_device_guard.py +++ b/python/paddle/fluid/tests/unittests/test_device_guard.py @@ -33,6 +33,14 @@ def execute(main_program, startup_program): exe.run(main_program) + +
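+# Warnings captured by catch_warnings may include unrelated messages from +# other modules, so count only the warnings containing the expected text +# instead of asserting on len(w).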
+def get_valid_warning_num(warning, w): + num = 0 + for i in range(len(w)): + if warning in str(w[i].message): + num += 1 + return num + + class TestDeviceGuard(unittest.TestCase): def test_device_guard(self): main_program = fluid.Program() @@ -133,7 +141,10 @@ class TestDeviceGuard(unittest.TestCase): i = fluid.layers.increment(x=i, value=1, in_place=True) fluid.layers.less_than(x=i, y=loop_len, cond=cond) - assert len(w) == 1 + warning = "The Op(while) is not support to set device." + warning_num = get_valid_warning_num(warning, w) + assert warning_num == 1 + all_ops = main_program.global_block().ops device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() for op in all_ops: @@ -169,7 +180,10 @@ class TestDeviceGuard(unittest.TestCase): shape=[1], value=4.0, dtype='float32') result = fluid.layers.less_than(x=x, y=y, force_cpu=False) - assert len(w) == 2 + warning = "\'device_guard\' has higher priority when they are used at the same time." + warning_num = get_valid_warning_num(warning, w) + assert warning_num == 2 + all_ops = main_program.global_block().ops device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() for op in all_ops: diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py index f339081e31b87b8d5584fd4f866e0aaf6f391ea7..007affc14084956870e50e0e1b6b650d13cc3926 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py @@ -67,6 +67,13 @@ class TestElementwiseModOp_scalar(TestElementwiseModOp): self.out = np.floor_divide(self.x, self.y) + +class TestElementwiseModOpInverse(TestElementwiseModOp): + def init_input_output(self): + self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype) + self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype) + self.out = np.floor_divide(self.x, self.y) + + class TestFloorDivideOp(unittest.TestCase): def test_name(self): with fluid.program_guard(fluid.Program()): diff --git a/python/paddle/fluid/tests/unittests/test_empty_like_op.py b/python/paddle/fluid/tests/unittests/test_empty_like_op.py new file mode 100644 index 0000000000000000000000000000000000000000..32d732d9a809950ade5484431b833056336acd54 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_empty_like_op.py @@ -0,0 +1,192 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
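+ +# NOTE: paddle.empty_like returns uninitialized memory, so the checks below +# assert only on shape, dtype and value sanity, never on exact values.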
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.fluid.data_feeder import convert_dtype +import paddle.fluid.core as core +from paddle.static import program_guard, Program + + +class TestEmptyLikeAPICommon(unittest.TestCase): + def __check_out__(self, out): + data_type = convert_dtype(out.dtype) + self.assertEqual(data_type, self.dst_dtype, + 'dtype should be %s, but get %s' % + (self.dst_dtype, data_type)) + + shape = out.shape + self.assertTupleEqual(shape, self.dst_shape, + 'shape should be %s, but get %s' % + (self.dst_shape, shape)) + + if data_type in ['float32', 'float64', 'int32', 'int64']: + max_value = np.nanmax(out) + min_value = np.nanmin(out) + always_non_full_zero = max_value > min_value + always_full_zero = max_value == 0.0 and min_value == 0.0 + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + elif data_type in ['bool']: + total_num = out.size + true_num = np.sum(out == True) + false_num = np.sum(out == False) + self.assertTrue(total_num == true_num + false_num, + 'The value should always be True or False.') + else: + self.assertTrue(False, 'invalid data type') + + +class TestEmptyLikeAPI(TestEmptyLikeAPICommon): + def setUp(self): + self.init_config() + + def test_dygraph_api_out(self): + paddle.disable_static() + out = paddle.empty_like(self.x, self.dtype) + self.__check_out__(out.numpy()) + paddle.enable_static() + + def init_config(self): + self.x = np.random.random((200, 3)).astype("float32") + self.dtype = self.x.dtype + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI2(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("float64") + self.dtype = self.x.dtype + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI3(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("int") + self.dtype = self.x.dtype + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI4(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("int64") + self.dtype = self.x.dtype + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI5(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("bool") + self.dtype = self.x.dtype + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI6(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("float64") + self.dtype = "float32" + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI7(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("int") + self.dtype = "float32" + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI8(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("int64") + self.dtype = "float32" + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI9(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("bool") + self.dtype = "float32" + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI10(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("float32") + self.dtype = "bool" + 
self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI_Static(TestEmptyLikeAPICommon): + def setUp(self): + self.init_config() + + def test_static_graph(self): + dtype = 'float32' + + train_program = Program() + startup_program = Program() + + with program_guard(train_program, startup_program): + x = np.random.random(self.x_shape).astype(dtype) + data_x = paddle.static.data( + 'x', shape=self.data_x_shape, dtype=dtype) + + out = paddle.empty_like(data_x) + + place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else paddle.CPUPlace() + exe = paddle.static.Executor(place) + res = exe.run(train_program, feed={'x': x}, fetch_list=[out]) + + self.dst_dtype = dtype + self.dst_shape = x.shape + self.__check_out__(res[0]) + + def init_config(self): + self.x_shape = (200, 3) + self.data_x_shape = [200, 3] + + +class TestEmptyLikeAPI_Static2(TestEmptyLikeAPI_Static): + def init_config(self): + self.x_shape = (3, 200, 3) + self.data_x_shape = [-1, 200, 3] + + +class TestEmptyError(unittest.TestCase): + def test_attr(self): + def test_dtype(): + x = np.random.random((200, 3)).astype("float64") + dtype = 'uint8' + result = paddle.empty_like(x, dtype=dtype) + + self.assertRaises(TypeError, test_dtype) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py index 66baf8faac51ebd19689a5b22b87f3a454842fac..fc57602b445ddd6aa615b47ecce6bf993703d858 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py @@ -21,8 +21,7 @@ from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver import os import sys -from paddle.fluid.incubate.fleet.utils.fs import LocalFS -from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index 6f8af3017efcb9010b129131a01c5ee071b5bc36..b20f33e11b656f1296510df653309a3569d45043 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -86,6 +86,13 @@ class TestStrategyConfig(unittest.TestCase): self.assertEqual(strategy.localsgd_configs["k_steps"], 4) self.assertEqual(strategy.localsgd_configs["begin_step"], 120) + def test_adaptive_localsgd_configs(self): + strategy = paddle.distributed.fleet.DistributedStrategy() + configs = {"init_k_steps": 1, "begin_step": 120} + strategy.adaptive_localsgd_configs = configs + self.assertEqual(strategy.adaptive_localsgd_configs["init_k_steps"], 1) + self.assertEqual(strategy.adaptive_localsgd_configs["begin_step"], 120) + def test_dgc(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.dgc = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py index 945f5ae57454b2c4a509badb93574a6e03b607e8..f5347b0c665e2a162f7f8210171ec415afee4599 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py @@ -52,5 +52,36 @@ class 
TestFleetLocalSGDMetaOptimizer(unittest.TestCase): optimizer.minimize(avg_cost) +class TestFleetAdaptiveLocalSGDMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" + + def test_adaptive_localsgd_optimizer(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.adaptive_localsgd = True + config = strategy.adaptive_localsgd_configs + config['init_k_steps'] = 1 + config['begin_step'] = 1 + strategy.adaptive_localsgd_configs = config + + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py index 6414ef18d635aea4b73c17a4931c37e596ed6029..6cb40eef27e4d93606167232dba3fc181af3c17a 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py @@ -40,9 +40,9 @@ class TestCloudRoleMaker(unittest.TestCase): from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib from paddle.fluid.incubate.fleet.base.role_maker import \ GeneralRoleMaker - from paddle.distributed.fleet.utils import KVHandler - from paddle.distributed.fleet.utils import KVServer - from paddle.distributed.fleet.utils import KVHTTPServer + from paddle.distributed.fleet.utils.http_server import KVHandler + from paddle.distributed.fleet.utils.http_server import KVServer + from paddle.distributed.fleet.utils.http_server import KVHTTPServer except: print("warning: no fleet, skip test_pslib_4") return diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py index dde36e073fb20eed3b17c79a886739f59ecb185d..d506088fde0291c1aab7204f5b3ba1a1ab19aa3f 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_util.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py @@ -81,12 +81,12 @@ class TestFleetUtil(unittest.TestCase): self.assertEqual(user_id, 10) def test_fs(self): - from paddle.distributed.fleet.utils import LocalFS + from paddle.distributed.fleet.utils.fs import LocalFS fs = LocalFS() dirs, files = fs.ls_dir("test_tmp") dirs, files = fs.ls_dir("./") self.assertFalse(fs.need_upload_download()) - fleet_util.set_file_system(fs) + fleet_util._set_file_system(fs) def test_barrier(self): try: diff --git a/python/paddle/fluid/tests/unittests/test_fs_interface.py b/python/paddle/fluid/tests/unittests/test_fs_interface.py index c01876531c99c610706265ff646d93c4a197a26e..581fa9738116dc5c737fd3264528b991e210888b 100644 --- a/python/paddle/fluid/tests/unittests/test_fs_interface.py +++ b/python/paddle/fluid/tests/unittests/test_fs_interface.py @@ -20,7 +20,7 @@ import os import sys import inspect -from paddle.distributed.fleet.utils import LocalFS, FS, HDFSClient, 
FSTimeOut, FSFileExistsError, FSFileNotExistsError +from paddle.distributed.fleet.utils.fs import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError class FSTest(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py index 1f6e522d2668b5dcd2075ff7af6b4b1ee674632d..5dcce88acf16b96fc26cd56c2e5e034d19405b6c 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -216,7 +216,7 @@ class API_TestGather(unittest.TestCase): "index": index_np, 'axis': axis_np}, fetch_list=[out]) - expected_output = gather_numpy(x_np, index_np, axis_np) + expected_output = gather_numpy(x_np, index_np, axis_np[0]) self.assertTrue(np.allclose(result, expected_output)) diff --git a/python/paddle/fluid/tests/unittests/test_hdfs1.py b/python/paddle/fluid/tests/unittests/test_hdfs1.py index 430ed1abe860869d791f0eac17accc8416db1eca..1aac1236156ca159e9b285611a55f38925be22c2 100644 --- a/python/paddle/fluid/tests/unittests/test_hdfs1.py +++ b/python/paddle/fluid/tests/unittests/test_hdfs1.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase import unittest import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker @@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet import os import sys -from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError java_home = os.environ["JAVA_HOME"] -from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase - class FSTest1(FSTestBase): def test_timeout(self): diff --git a/python/paddle/fluid/tests/unittests/test_hdfs2.py b/python/paddle/fluid/tests/unittests/test_hdfs2.py index 7754f89e3c901ac14cb102881e8d338442038559..1fa019bb9cd02c5f9a7181bc321618d5c37a8755 100644 --- a/python/paddle/fluid/tests/unittests/test_hdfs2.py +++ b/python/paddle/fluid/tests/unittests/test_hdfs2.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase import unittest import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker @@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet import os import sys -from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError java_home = os.environ["JAVA_HOME"] -from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase - class FSTest2(FSTestBase): def test_hdfs(self): diff --git a/python/paddle/fluid/tests/unittests/test_hdfs3.py b/python/paddle/fluid/tests/unittests/test_hdfs3.py index 1a045f4b17fc9b8b68ccf81a23cb953db58a9db7..218bf12ca608a1fd908b3c6e2533517a960e6a23 100644 --- a/python/paddle/fluid/tests/unittests/test_hdfs3.py +++ b/python/paddle/fluid/tests/unittests/test_hdfs3.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase import unittest import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker @@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet import os import sys -from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError java_home = os.environ["JAVA_HOME"] -from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase - class FSTest3(FSTestBase): def test_hdfs(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py b/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py deleted file mode 100644 index 171687283bc5db709501ae33d131470582f4d106..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import paddle.fluid as fluid -import numpy as np - - -class MLP(fluid.Layer): - def __init__(self, input_size): - super(MLP, self).__init__() - self._linear1 = fluid.dygraph.Linear( - input_size, - 3, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1)), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1))) - self._linear2 = fluid.dygraph.Linear( - 3, - 4, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1)), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1))) - - def forward(self, inputs): - x = self._linear1(inputs) - x = self._linear2(x) - x = fluid.layers.reduce_sum(x) - return x - - -class TestDygraphDebugString(unittest.TestCase): - def test_dygraph_debug_string(self): - np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - unique_name = 0 - trace_var = 0 - alive_var = 0 - with fluid.dygraph.guard(): - mlp = MLP(input_size=2) - for i in range(10): - var_inp = fluid.dygraph.base.to_variable(np_inp) - out = mlp(var_inp) - out.backward() - mlp.clear_gradients() - unique_name_tmp, trace_var_tmp, alive_var_tmp = fluid.dygraph.base._print_debug_msg( - mlp.parameters(), is_test=True) - if i > 0: - self.assertGreaterEqual(unique_name, unique_name_tmp) - self.assertGreaterEqual(trace_var, trace_var_tmp) - self.assertGreaterEqual(alive_var, alive_var_tmp) - else: - unique_name = unique_name_tmp - trace_var = trace_var_tmp - alive_var = alive_var_tmp - try: - fluid.dygraph.base._print_debug_msg(mlp.parameters()) - except Exception as e: - raise RuntimeError( - "No Exception is accepted in _print_debug_msg, but we got: {}". 
- format(e)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py index 0af8132acfd26917f83dda6350583c7a06858a7d..f2dfaef397797a9d1b7e2900cdb2cc3617bcb933 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py @@ -21,7 +21,6 @@ import numpy as np class TestImperativeUsingNonZeroGpu(unittest.TestCase): def run_main(self, np_arr, place): with guard(place): - embedding = Embedding(size=[10, 10]) var = to_variable(np_arr) self.assertTrue(np.array_equal(np_arr, var.numpy())) @@ -30,7 +29,6 @@ class TestImperativeUsingNonZeroGpu(unittest.TestCase): return np_arr = np.random.random([11, 13]).astype('float32') - self.run_main(np_arr, fluid.CUDAPlace(1)) self.run_main(np_arr, fluid.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py index ed1939dbe279f28883d9e33178f1cfa256140e33..a1a9b3f444fa411f90e869f5265fa0933393ff56 100644 --- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py +++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -64,7 +64,7 @@ class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase): self.batch_size = 128 self.batch_num = 10 - def train_and_save_model(self): + def train_and_save_model(self, only_params=False): with new_program_scope(): startup_program = fluid.default_startup_program() main_program = fluid.default_main_program() @@ -102,11 +102,15 @@ class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase): static_param_dict[param.name] = fluid.executor._fetch_var( param.name) - fluid.io.save_inference_model( - self.save_dirname, ["img"], [prediction], - exe, - model_filename=self.model_filename, - params_filename=self.params_filename) + if only_params: + fluid.io.save_params( + exe, self.save_dirname, filename=self.params_filename) + else: + fluid.io.save_inference_model( + self.save_dirname, ["img"], [prediction], + exe, + model_filename=self.model_filename, + params_filename=self.params_filename) return static_param_dict @@ -120,9 +124,7 @@ class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase): self.params_filename = None orig_param_dict = self.train_and_save_model() - configs = paddle.SaveLoadConfig() - configs.separate_params = True - load_param_dict, _ = paddle.load(self.save_dirname, configs) + load_param_dict, _ = paddle.load(self.save_dirname) self.check_load_state_dict(orig_param_dict, load_param_dict) def test_load_with_model_filename(self): @@ -160,6 +162,14 @@ class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase): load_param_dict, _ = paddle.load(self.save_dirname, configs) self.check_load_state_dict(orig_param_dict, load_param_dict) + def test_load_state_dict_from_save_params(self): + self.save_dirname = "static_mnist.load_state_dict.save_params" + self.params_filename = None + orig_param_dict = self.train_and_save_model(True) + + load_param_dict, _ = paddle.load(self.save_dirname) + self.check_load_state_dict(orig_param_dict, load_param_dict) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index b0b85f633a2bf613cdbdcc2ba7b31b5d970da8ca..80b201d0842183750361d5e08bab5f78f40a858b 100644 --- 
a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -67,6 +67,22 @@ class TestSumOp6D(OpTest): self.check_grad(['X'], 'Out') +class TestSumOp8D(OpTest): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = { + 'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float64") + } + self.attrs = {'dim': (0, 3)} + self.outputs = {'Out': self.inputs['X'].sum(axis=(0, 3))} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + @skip_check_grad_ci( reason="reduce_max is discontinuous non-derivable function," " its gradient check is not supported by unittest framework.") @@ -103,6 +119,40 @@ class TestMinOp(OpTest): self.check_output() +class TestMin6DOp(OpTest): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.inputs = { + 'X': np.random.random((2, 4, 3, 5, 6, 10)).astype("float64") + } + self.attrs = {'dim': [2, 4]} + self.outputs = { + 'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output() + + +class TestMin8DOp(OpTest): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.inputs = { + 'X': np.random.random((2, 4, 3, 5, 6, 3, 2, 4)).astype("float64") + } + self.attrs = {'dim': [2, 3, 4]} + self.outputs = { + 'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output() + + class TestProdOp(OpTest): def setUp(self): self.op_type = "reduce_prod" @@ -116,6 +166,42 @@ class TestProdOp(OpTest): self.check_grad(['X'], 'Out') +class TestProd6DOp(OpTest): + def setUp(self): + self.op_type = "reduce_prod" + self.inputs = { + 'X': np.random.random((5, 6, 2, 3, 4, 2)).astype("float64") + } + self.attrs = {'dim': [2, 3, 4]} + self.outputs = { + 'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestProd8DOp(OpTest): + def setUp(self): + self.op_type = "reduce_prod" + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64") + } + self.attrs = {'dim': [2, 3, 4]} + self.outputs = { + 'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + class TestAllOp(OpTest): def setUp(self): self.op_type = "reduce_all" @@ -127,12 +213,40 @@ class TestAllOp(OpTest): self.check_output() +class TestAll8DOp(OpTest): + def setUp(self): + self.op_type = "reduce_all" + self.inputs = { + 'X': np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.attrs = {'reduce_all': True, 'dim': (2, 3, 4)} + self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])} + + def test_check_output(self): + self.check_output() + + class TestAllOpWithDim(OpTest): def setUp(self): self.op_type = "reduce_all" self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} - self.attrs = {'dim': [1]} - self.outputs = {'Out': self.inputs['X'].all(axis=1)} + self.attrs = {'dim': (1, )} + self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])} + + def test_check_output(self): + self.check_output() + + +class TestAll8DOpWithDim(OpTest): + def setUp(self): + 
self.op_type = "reduce_all" + self.inputs = { + 'X': np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.attrs = {'dim': (1, 3, 4)} + self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])} def test_check_output(self): self.check_output() @@ -152,6 +266,23 @@ class TestAllOpWithKeepDim(OpTest): self.check_output() +class TestAll8DOpWithKeepDim(OpTest): + def setUp(self): + self.op_type = "reduce_all" + self.inputs = { + 'X': np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.attrs = {'dim': (5, ), 'keep_dim': True} + self.outputs = { + 'Out': np.expand_dims( + self.inputs['X'].all(axis=self.attrs['dim']), axis=5) + } + + def test_check_output(self): + self.check_output() + + class TestAllOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): @@ -175,6 +306,20 @@ class TestAnyOp(OpTest): self.check_output() +class TestAny8DOp(OpTest): + def setUp(self): + self.op_type = "reduce_any" + self.inputs = { + 'X': np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.attrs = {'reduce_all': True, 'dim': (3, 5, 4)} + self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])} + + def test_check_output(self): + self.check_output() + + class TestAnyOpWithDim(OpTest): def setUp(self): self.op_type = "reduce_any" @@ -186,14 +331,45 @@ class TestAnyOpWithDim(OpTest): self.check_output() +class TestAny8DOpWithDim(OpTest): + def setUp(self): + self.op_type = "reduce_any" + self.inputs = { + 'X': np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.attrs = {'dim': (3, 6)} + self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])} + + def test_check_output(self): + self.check_output() + + class TestAnyOpWithKeepDim(OpTest): def setUp(self): self.op_type = "reduce_any" self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} - self.attrs = {'dim': [1], 'keep_dim': True} + self.attrs = {'dim': (1, ), 'keep_dim': True} + self.outputs = { + 'Out': np.expand_dims( + self.inputs['X'].any(axis=self.attrs['dim']), axis=1) + } + + def test_check_output(self): + self.check_output() + + +class TestAny8DOpWithKeepDim(OpTest): + def setUp(self): + self.op_type = "reduce_any" + self.inputs = { + 'X': np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.attrs = {'dim': (1, ), 'keep_dim': True} self.outputs = { 'Out': np.expand_dims( - self.inputs['X'].any(axis=1), axis=1) + self.inputs['X'].any(axis=self.attrs['dim']), axis=1) } def test_check_output(self): @@ -283,6 +459,18 @@ class Test3DReduce3(Test1DReduce): } +class Test8DReduce0(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': (4, 2, 3)} + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64") + } + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + class TestKeepDimReduce(Test1DReduce): def setUp(self): self.op_type = "reduce_sum" @@ -294,6 +482,19 @@ class TestKeepDimReduce(Test1DReduce): } +class TestKeepDim8DReduce(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64") + } + self.attrs = {'dim': (3, 4, 5), 'keep_dim': True} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']), + keepdims=self.attrs['keep_dim']) + } + + class TestReduceAll(Test1DReduce): def setUp(self): self.op_type = "reduce_sum" @@ -302,6 +503,16 @@ class 
TestReduceAll(Test1DReduce): self.outputs = {'Out': self.inputs['X'].sum()} +class TestReduceAll8D(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64") + } + self.attrs = {'reduce_all': True, 'dim': (3, 4, 5)} + self.outputs = {'Out': self.inputs['X'].sum(axis=self.attrs['dim'])} + + @skip_check_grad_ci( reason="reduce_max is discontinuous non-derivable function," " its gradient check is not supported by unittest framework.") diff --git a/python/paddle/fluid/tests/unittests/test_save_model_without_var.py b/python/paddle/fluid/tests/unittests/test_save_model_without_var.py index b74a6e10917f71b669a2d2e6906d6c7310c59c7e..4c63dced83b199acaffc3e703785b3eb8ec8913b 100644 --- a/python/paddle/fluid/tests/unittests/test_save_model_without_var.py +++ b/python/paddle/fluid/tests/unittests/test_save_model_without_var.py @@ -50,7 +50,7 @@ class TestSaveModelWithoutVar(unittest.TestCase): params_filename='params') expected_warn = "no variable in your model, please ensure there are any variables in your model to save" self.assertTrue(len(w) > 0) - self.assertTrue(expected_warn == str(w[0].message)) + self.assertTrue(expected_warn == str(w[-1].message)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index d5d1fdc5b20b9d786f1861d63bb4a646117b80ed..56333211469db5705d04cc5ca253bf01679190a5 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -99,6 +99,18 @@ class TestCase7(TestTransposeOp): self.axis = (0, 1, 3, 2) + +class TestCase8(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (0, 1, 3, 2, 4, 5, 6, 7) + + +class TestCase9(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (6, 1, 3, 5, 0, 2, 4, 7) + + class TestTransposeOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 2836a151ec35698a31f3814d573828853349a151..d41852c9d7f4f5812a852d6a5c644e75d137f530 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -792,15 +792,14 @@ class Model(object): switched by `paddle.disable_static()`. The usage is as follows. But note, the switching between dynamic and static should be before instantiating a Model. The input description, i.e, paddle.static.InputSpec, - must be required for static graph. + is always required. Args: network (paddle.nn.Layer): The network is an instance of paddle.nn.Layer. inputs (InputSpec|list|dict|None): `inputs`, entry points of network, could be a InputSpec instance, or lits of InputSpec instances, - or dict ({name: InputSpec}), or None. For static graph, - inputs must be set. For dynamic graph, it could be None. + or dict ({name: InputSpec}), and it can not be None. labels (InputSpec|list|None): `labels`, entry points of network, could be a InputSpec instnace or lits of InputSpec instances, or None.
For static graph, if labels is required in loss, @@ -849,10 +848,9 @@ class Model(object): self._optimizer = None self._test_dataloader = None - if not in_dygraph_mode(): - if not isinstance(inputs, (list, dict, Input)): - raise TypeError( - "'inputs' must be list or dict in static graph mode") + if not isinstance(inputs, (list, dict, Input)): + raise TypeError( + "'inputs' must be a list, dict, or Input, and can not be None.") self._inputs = self._verify_spec(inputs, True) self._labels = self._verify_spec(labels) @@ -1004,11 +1002,7 @@ class Model(object): have no variable need to save (like SGD), the fill will not generated). This function will silently overwrite existing file at the target location. - If `training` is set to False, only inference model will be saved. It - should be noted that before using `save`, you should run the model, and - the shape of input you saved is as same as the input of its running. - `@paddle.jit.to_static` must be added on `forward` function of your layer - in dynamic mode now and these will be optimized later. + If `training` is set to False, only inference model will be saved. Args: path (str): The file prefix to save model. The format is @@ -1037,8 +1031,6 @@ class Model(object): nn.Linear(200, 10), nn.Softmax()) - # If save for inference in dygraph, need this - @paddle.jit.to_static def forward(self, x): return self.net(x) @@ -1046,7 +1038,7 @@ class Model(object): device = paddle.set_device('cpu') # if use static graph, do not set paddle.disable_static(device) if dynamic else None - # inputs and labels are not required for dynamic graph. + input = InputSpec([None, 784], 'float32', 'x') label = InputSpec([None, 1], 'int64', 'label') model = paddle.Model(Mnist(), input, label) @@ -1649,10 +1641,6 @@ class Model(object): model_only=False): """ Save inference model can be in static or dynamic mode. - It should be noted that before using `save_inference_model`, you should - run the model, and the shape you saved is as same as the input of its - running. `@paddle.jit.to_static` must be added on `forward` function of - your layer in dynamic mode now and these will be optimized later. Args: save_dir (str): The directory path to save the inference model. @@ -1678,20 +1666,17 @@ class Model(object): return result_list - # TODO: - # 1. Make it Unnecessary to run model before calling `save_inference_model` for users in dygraph. - # 2. Save correct shape of input, now the interface stores the shape that the user sent to - # the inputs of the model in running. - # 3. Make it Unnecessary to add `@paddle.jit.to_static` for users in dynamic mode. if fluid.in_dygraph_mode(): with fluid.framework._dygraph_guard(None): layer = self.network + layer.forward = paddle.jit.to_static( + layer.forward, input_spec=self._inputs)
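+ # NOTE: converting forward with the declared input_spec gives the + # traced program its input shapes, so the model no longer has to be + # run (or forward decorated with @paddle.jit.to_static) before saving.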
# 1. input check prog_translator = ProgramTranslator() - if not prog_translator.enable_declarative: + if not prog_translator.enable_to_static: raise RuntimeError( - "save_inference_model doesn't work when setting ProgramTranslator.enable=False." + "save_inference_model doesn't work when setting ProgramTranslator.enable to False." ) if not isinstance(layer, Layer): raise TypeError( @@ -1879,18 +1864,7 @@ class Model(object): def _verify_spec(self, specs, is_input=False): out_specs = [] - if specs is None: - # Note(Aurelius84): If not specific specs of `Input`, using argument names of `forward` function - # to generate `Input`. But how can we know the actual shape of each input tensor? - if is_input: - out_specs = [ - Input( - name=n, shape=[None]) - for n in extract_args(self.network.forward) if n != 'self' - ] - else: - out_specs = to_list(specs) - elif isinstance(specs, dict): + if isinstance(specs, dict): assert is_input == False out_specs = [specs[n] \ for n in extract_args(self.network.forward) if n != 'self'] @@ -1902,8 +1876,8 @@ class Model(object): assert isinstance(spec, Input) if spec.name is None: raise ValueError( - "Requires Input[{}].name != None, but receive `None` with {}.". - format(i, spec)) + "Requires Input[{}].name != None, but receive `None` with {}." + .format(i, spec)) return out_specs diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index da086c0955e849619ccbce17a297ca4615a3f3d0..4395520eec70e8483cb61097a166576f4040cb4d 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1093,7 +1093,7 @@ def cross_entropy(input, " 'none', but received %s, which is not allowed." % reduction) #step 1. log_softmax - log_softmax_out = paddle.nn.functional.log_softmax(input) + log_softmax_out = paddle.nn.functional.log_softmax(input, axis=1) if weight is not None and not isinstance(weight, Variable): raise ValueError( "The weight' is not a Variable, please convert to Variable.") diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index ff09f4c562aeb8216dea6e1d40b9492c257aeab4..aadfb3f49ed61367b9502e1a00ad5b9c027a32b7 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -32,6 +32,21 @@ import random import zlib import paddle.compat as cpt +# On macOS, the 'spawn' start method became the default in Python 3.8's +# multiprocessing. Paddle can not work with 'spawn' yet, so force subprocesses +# to start with the 'fork' start method. +# +# TODO: This workaround is not ideal, because the fork start method can lead to +# crashes of the subprocess. Figure out how to make 'spawn' work.
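+# (Under 'spawn', readers and everything they capture must be picklable and +# the main module is re-imported in each child; 'fork' inherits them directly.)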
+# +# For more details, please refer to +# https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods +# https://bugs.python.org/issue33725 +if sys.version_info >= (3, 8): + fork_context = multiprocessing.get_context('fork') +else: + fork_context = multiprocessing + def cache(reader): """ @@ -560,9 +575,9 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000): six.reraise(*sys.exc_info()) def queue_reader(): - queue = multiprocessing.Queue(queue_size) + queue = fork_context.Queue(queue_size) for reader in readers: - p = multiprocessing.Process( + p = fork_context.Process( target=_read_into_queue, args=(reader, queue)) p.start() @@ -593,9 +608,9 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000): def pipe_reader(): conns = [] for reader in readers: - parent_conn, child_conn = multiprocessing.Pipe() + parent_conn, child_conn = fork_context.Pipe() conns.append(parent_conn) - p = multiprocessing.Process( + p = fork_context.Process( target=_read_into_pipe, args=(reader, child_conn)) p.start() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 8bb584be2362e7b02bc5b7c5603b148d37499c2d..a713663e1822d4af2d09efb2986aeb513930bbc0 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -41,6 +41,7 @@ from .creation import triu #DEFINE_ALIAS from .creation import tril #DEFINE_ALIAS from .creation import meshgrid #DEFINE_ALIAS from .creation import empty #DEFINE_ALIAS +from .creation import empty_like #DEFINE_ALIAS from .io import save #DEFINE_ALIAS from .io import load #DEFINE_ALIAS from .linalg import matmul #DEFINE_ALIAS diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 8011b92964b7e21fd930f19cec954b27f470e0c6..9aee911e568d1b2cd7aac0cf45e44f2886612a5a 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -49,6 +49,7 @@ __all__ = [ 'full', 'full_like', 'empty', + 'empty_like', 'triu', 'tril', 'meshgrid' @@ -1068,3 +1069,70 @@ def empty(shape, dtype=None, name=None): stop_gradient=True) out.stop_gradient = True return out + + +def empty_like(x, dtype=None, name=None): + """ + This OP returns a Tensor with uninitialized data, which has the same shape as ``x`` and whose data type is ``dtype``. + If ``dtype`` is None, the data type of the returned Tensor is the same as that of ``x``. + + Args: + x(Tensor): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64. + dtype(np.dtype|str, optional): The data type of output. The data type can be one + of bool, float16, float32, float64, int32, int64. The default value is None, which means the output + data type is the same as input. + name(str, optional): The default value is None. Normally there is no need for the user to set this + property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: Tensor which is created according to ``x`` and ``dtype``, and is uninitialized. + + Examples: + ..
code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() # Now we are in imperative mode + paddle.set_device("cpu") # and use cpu device + + x = paddle.randn([2, 3], 'float32') + output = paddle.empty_like(x) + #[[1.8491974e+20 1.8037303e+28 1.7443726e+28] # uninitialized + # [4.9640171e+28 3.0186127e+32 5.6715899e-11]] # uninitialized + """ + + if dtype is None: + dtype = x.dtype + dtype = convert_dtype(dtype) + + if in_dygraph_mode(): + out = core.ops.empty('shape', x.shape, 'dtype', + convert_np_dtype_to_dtype_(dtype)) + out.stop_gradient = True + return out + + helper = LayerHelper("empty_like", **locals()) + check_variable_and_dtype( + x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'empty_like') + check_dtype(dtype, 'dtype', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'empty_like') + out = helper.create_variable_for_type_inference(dtype=dtype) + + inputs = {} + attrs = {} + attrs['dtype'] = convert_np_dtype_to_dtype_(dtype) + shape = paddle.shape(x) + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='empty_like') + + helper.append_op( + type='empty', + inputs=inputs, + outputs={'Out': [out]}, + attrs=attrs, + stop_gradient=True) + out.stop_gradient = True + return out diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 5c4e98feaa686217bc78ad3915423593ad4fcdce..62cc39c1f7b5303d98bffd9eb5814d4579a6d3f1 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -67,35 +67,6 @@ class LeNetDygraph(paddle.nn.Layer): return x -class LeNetDeclarative(fluid.dygraph.Layer): - def __init__(self, num_classes=10, classifier_activation=None): - super(LeNetDeclarative, self).__init__() - self.num_classes = num_classes - self.features = Sequential( - Conv2d( - 1, 6, 3, stride=1, padding=1), - ReLU(), - Pool2D(2, 'max', 2), - Conv2d( - 6, 16, 5, stride=1, padding=0), - ReLU(), - Pool2D(2, 'max', 2)) - - if num_classes > 0: - self.fc = Sequential( - Linear(400, 120), Linear(120, 84), Linear(84, 10), - Softmax()) #Todo: accept any activation - - @declarative - def forward(self, inputs): - x = self.features(inputs) - - if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) - x = self.fc(x) - return x - - class MnistDataset(MNIST): def __init__(self, mode, return_label=True, sample_num=None): super(MnistDataset, self).__init__(mode=mode) @@ -444,7 +415,9 @@ class TestModelFunction(unittest.TestCase): # dynamic saving device = paddle.set_device('cpu') fluid.enable_dygraph(device) - model = Model(MyModel(classifier_activation=None)) + inputs = [InputSpec([None, 20], 'float32', 'x')] + labels = [InputSpec([None, 1], 'int64', 'label')] + model = Model(MyModel(classifier_activation=None), inputs, labels) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) @@ -543,11 +516,10 @@ class TestModelFunction(unittest.TestCase): def test_export_deploy_model(self): for dynamic in [True, False]: - fluid.enable_dygraph() if dynamic else None - # paddle.disable_static() if dynamic else None + paddle.disable_static() if dynamic else None prog_translator = ProgramTranslator() prog_translator.enable(False) if not dynamic else None - net = LeNetDeclarative() + net = LeNet() inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')] model = Model(net, inputs) model.prepare() @@ -556,8 +528,9 @@ class TestModelFunction(unittest.TestCase): 
os.makedirs(save_dir) tensor_img = np.array( np.random.random((1, 1, 28, 28)), dtype=np.float32) - ori_results = model.test_batch(tensor_img) + model.save(save_dir, training=False) + ori_results = model.test_batch(tensor_img) fluid.disable_dygraph() if dynamic else None place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda( @@ -574,6 +547,7 @@ class TestModelFunction(unittest.TestCase): np.testing.assert_allclose( results, ori_results, rtol=1e-5, atol=1e-7) shutil.rmtree(save_dir) + paddle.enable_static() class TestRaiseError(unittest.TestCase): @@ -585,6 +559,14 @@ class TestRaiseError(unittest.TestCase): with self.assertRaises(ValueError): model = Model(net, inputs, labels) + def test_input_without_input_spec(self): + for dynamic in [True, False]: + paddle.disable_static() if dynamic else None + net = MyModel(classifier_activation=None) + with self.assertRaises(TypeError): + model = Model(net) + paddle.enable_static() + if __name__ == '__main__': unittest.main() diff --git a/python/requirements.txt b/python/requirements.txt index c8d3b2af1794bb0858b187d6a4c641322f50cdd1..478884247553fbe44ba0295e6398d086cfcd9e41 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -13,11 +13,10 @@ scipy ; python_version>"3.5" nltk ; python_version>="3.5" rarfile Pillow -graphviz six decorator prettytable -objgraph astor pathlib -netifaces +netifaces ; platform_system != "Windows" +netifaces ; python_version>="3.5" and platform_system == "Windows" diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..af558c2ef0b42b68e47fe98ebd626c9b9034bef9 --- /dev/null +++ b/setup.py @@ -0,0 +1,577 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
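+ +# NOTE: this file is a build-time template: the @VAR@ and ${VAR} placeholders +# below are substituted by the build system (e.g. CMake's configure_file) to +# produce the setup.py that is actually executed.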
+ +import subprocess +import os +import os.path +import errno +import re +import shutil +import sys +import fnmatch +import platform + +from contextlib import contextmanager +from setuptools import Command +from setuptools import setup, Distribution, Extension +from setuptools.command.install import install as InstallCommandBase + + +class BinaryDistribution(Distribution): + def has_ext_modules(self): + return True + + +RC = 0 + +ext_name = '.dll' if os.name == 'nt' else ('.dylib' if sys.platform == 'darwin' + else '.so') + + +def git_commit(): + try: + cmd = ['git', 'rev-parse', 'HEAD'] + git_commit = subprocess.Popen( + cmd, stdout=subprocess.PIPE, + cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip() + except: + git_commit = b'Unknown' + git_commit = git_commit.decode() + return str(git_commit) + + +def _get_version_detail(idx): + assert idx < 3, "version info consists of %(major)d.%(minor)d.%(patch)d, \ + so the detail index must be less than 3" + + if re.match('@TAG_VERSION_REGEX@', '@PADDLE_VERSION@'): + version_details = '@PADDLE_VERSION@'.split('.') + + if len(version_details) >= 3: + return version_details[idx] + + return 0 + + +def get_major(): + return int(_get_version_detail(0)) + + +def get_minor(): + return int(_get_version_detail(1)) + + +def get_patch(): + return str(_get_version_detail(2)) + + +def is_tagged(): + try: + cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD'] + git_tag = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip() + git_tag = git_tag.decode() + except: + return False + + if str(git_tag).replace('v', '') == '@PADDLE_VERSION@': + return True + else: + return False + + +def write_version_py(filename='paddle/version.py'): + cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY +# +full_version = '%(major)d.%(minor)d.%(patch)s' +major = '%(major)d' +minor = '%(minor)d' +patch = '%(patch)s' +rc = '%(rc)d' +istaged = %(istaged)s +commit = '%(commit)s' +with_mkl = '%(with_mkl)s' + +def show(): + if istaged: + print('full_version:', full_version) + print('major:', major) + print('minor:', minor) + print('patch:', patch) + print('rc:', rc) + else: + print('commit:', commit) + +def mkl(): + return with_mkl +''' + commit = git_commit() + with open(filename, 'w') as f: + f.write(cnt % { + 'major': get_major(), + 'minor': get_minor(), + 'patch': get_patch(), + 'rc': RC, + 'version': '${PADDLE_VERSION}', + 'commit': commit, + 'istaged': is_tagged(), + 'with_mkl': '@WITH_MKL@' + }) + + +write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py') + + +def write_distributed_training_mode_py( + filename='paddle/fluid/incubate/fleet/parameter_server/version.py'): + cnt = '''from __future__ import print_function + +# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY + +from paddle.fluid.incubate.fleet.base.mode import Mode + +BUILD_MODE=Mode.%(mode)s + +def is_transpiler(): + return Mode.TRANSPILER == BUILD_MODE + +''' + + dirname = os.path.dirname(filename) + + try: + os.makedirs(dirname) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + with open(filename, 'w') as f: + f.write(cnt % + {'mode': 'PSLIB' if '${WITH_PSLIB}' == 'ON' else 'TRANSPILER'}) + + +write_distributed_training_mode_py( + filename='@PADDLE_BINARY_DIR@/python/paddle/fluid/incubate/fleet/parameter_server/version.py' +) + +packages = [ + 'paddle', + 'paddle.libs', + 'paddle.utils', + 'paddle.dataset', + 'paddle.reader', + 'paddle.distributed', + 
'paddle.incubate.complex', + 'paddle.incubate.complex.tensor', + 'paddle.distributed.fleet', + 'paddle.distributed.fleet.base', + 'paddle.distributed.fleet.meta_optimizers', + 'paddle.distributed.fleet.runtime', + 'paddle.distributed.fleet.dataset', + 'paddle.distributed.fleet.metrics', + 'paddle.distributed.fleet.proto', + 'paddle.distributed.fleet.utils', + 'paddle.framework', + 'paddle.jit', + 'paddle.fluid', + 'paddle.fluid.inference', + 'paddle.fluid.dygraph', + 'paddle.fluid.dygraph.dygraph_to_static', + 'paddle.fluid.dygraph.amp', + 'paddle.fluid.proto', + 'paddle.fluid.proto.profiler', + 'paddle.fluid.distributed', + 'paddle.fluid.layers', + 'paddle.fluid.dataloader', + 'paddle.fluid.contrib', + 'paddle.fluid.contrib.decoder', + 'paddle.fluid.contrib.quantize', + 'paddle.fluid.contrib.reader', + 'paddle.fluid.contrib.slim', + 'paddle.fluid.contrib.slim.quantization', + 'paddle.fluid.contrib.slim.quantization.imperative', + 'paddle.fluid.contrib.utils', + 'paddle.fluid.contrib.extend_optimizer', + 'paddle.fluid.contrib.mixed_precision', + 'paddle.fluid.contrib.layers', + 'paddle.fluid.transpiler', + 'paddle.fluid.transpiler.details', + 'paddle.fluid.incubate', + 'paddle.fluid.incubate.data_generator', + 'paddle.fluid.incubate.fleet', + 'paddle.fluid.incubate.checkpoint', + 'paddle.fluid.incubate.fleet.base', + 'paddle.fluid.incubate.fleet.parameter_server', + 'paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler', + 'paddle.fluid.incubate.fleet.parameter_server.pslib', + 'paddle.fluid.incubate.fleet.parameter_server.ir', + 'paddle.fluid.incubate.fleet.collective', + 'paddle.fluid.incubate.fleet.utils', + 'paddle.hapi', + 'paddle.vision', + 'paddle.vision.models', + 'paddle.vision.transforms', + 'paddle.vision.datasets', + 'paddle.text', + 'paddle.text.datasets', + 'paddle.incubate', + 'paddle.io', + 'paddle.optimizer', + 'paddle.nn', + 'paddle.nn.functional', + 'paddle.nn.layer', + 'paddle.nn.initializer', + 'paddle.nn.utils', + 'paddle.metric', + 'paddle.static', + 'paddle.static.nn', + 'paddle.tensor', +] + +with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: + setup_requires = f.read().splitlines() + +# Note(wangzhongpu): +# When compiling paddle under python36, the dependencies belonging to python2.7 will be imported, resulting in errors when installing paddle +if sys.version_info >= (3, 6) and sys.version_info < (3, 7): + setup_requires_tmp = [] + for setup_requires_i in setup_requires: + if "<\"3.6\"" in setup_requires_i or "<\"3.5\"" in setup_requires_i or "<=\"3.5\"" in setup_requires_i: + continue + setup_requires_tmp += [setup_requires_i] + setup_requires = setup_requires_tmp +if sys.version_info >= (3, 5) and sys.version_info < (3, 6): + setup_requires_tmp = [] + for setup_requires_i in setup_requires: + if "<\"3.5\"" in setup_requires_i: + continue + setup_requires_tmp += [setup_requires_i] + setup_requires = setup_requires_tmp +if sys.version_info >= (3, 7): + setup_requires_tmp = [] + for setup_requires_i in setup_requires: + if "<\"3.6\"" in setup_requires_i or "<=\"3.6\"" in setup_requires_i or "<\"3.5\"" in setup_requires_i or "<=\"3.5\"" in setup_requires_i or "<\"3.7\"" in setup_requires_i: + continue + setup_requires_tmp += [setup_requires_i] + setup_requires = setup_requires_tmp + +if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: + setup_requires += ['opencv-python'] + +# the prefix is sys.prefix which should always be usr +paddle_bins = '' + +if not '${WIN32}': + paddle_bins = 
+
+# the prefix is sys.prefix, which should always be /usr
+paddle_bins = ''
+
+if not '${WIN32}':
+    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
+package_data = {
+    'paddle.fluid':
+    ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]
+}
+if '${HAS_NOAVX_CORE}' == 'ON':
+    package_data['paddle.fluid'] += [
+        'core_noavx' + ('.so' if os.name != 'nt' else '.pyd')
+    ]
+
+package_dir = {
+    '': '${PADDLE_BINARY_DIR}/python',
+    # The paddle.fluid.proto files are generated while compiling,
+    # so those packages point into the build directory.
+    'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
+    'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
+    'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid',
+}
+
+# put all thirdparty libraries in paddle.libs
+libs_path = '${PADDLE_BINARY_DIR}/python/paddle/libs'
+
+package_data['paddle.libs'] = [('libwarpctc'
+                                if os.name != 'nt' else 'warpctc') + ext_name]
+shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
+
+if '${WITH_MKL}' == 'ON':
+    shutil.copy('${MKLML_SHARED_LIB}', libs_path)
+    shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)
+    package_data['paddle.libs'] += [
+        ('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name,
+        ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name
+    ]
+else:
+    if os.name == 'nt':
+        # copy the openblas.dll
+        shutil.copy('${OPENBLAS_SHARED_LIB}', libs_path)
+        package_data['paddle.libs'] += ['openblas' + ext_name]
+
+if '${WITH_LITE}' == 'ON':
+    shutil.copy('${LITE_SHARED_LIB}', libs_path)
+    package_data['paddle.libs'] += ['libpaddle_full_api_shared' + ext_name]
+
+if '${WITH_PSLIB}' == 'ON':
+    shutil.copy('${PSLIB_LIB}', libs_path)
+    if os.path.exists('${PSLIB_VERSION_PY}'):
+        shutil.copy(
+            '${PSLIB_VERSION_PY}',
+            '${PADDLE_BINARY_DIR}/python/paddle/fluid/incubate/fleet/parameter_server/pslib/'
+        )
+    package_data['paddle.libs'] += ['libps' + ext_name]
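The interplay of package_dir, package_data, and the BinaryDistribution override is what turns the prebuilt shared libraries above into wheel contents. A self-contained sketch with hypothetical names (demo, build/libs, libfoo.so) rather than Paddle's actual layout:

# Sketch: bundling a prebuilt shared library the same way (hypothetical names).
from setuptools import setup, Distribution


class BinaryDistribution(Distribution):
    def has_ext_modules(self):
        # Claim extension modules so the wheel is tagged platform-specific,
        # even though setup() compiles nothing itself.
        return True


setup(
    name='demo',                                     # hypothetical package
    packages=['demo', 'demo.libs'],
    package_dir={'demo.libs': 'build/libs'},         # package sourced from the build tree
    package_data={'demo.libs': ['libfoo' + '.so']},  # ship the prebuilt binary
    distclass=BinaryDistribution,
)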
+
+if '${WITH_MKLDNN}' == 'ON':
+    if '${CMAKE_BUILD_TYPE}' == 'Release' and os.name != 'nt':
+        # only change rpath in Release mode.
+        # TODO(typhoonzero): use install_name_tool to patch mkl libs once
+        # we can support mkl on mac.
+        #
+        # change rpath of libdnnl.so.1, add $ORIGIN/ to it.
+        # The reason is that all thirdparty libraries live in the same
+        # directory, so libdnnl.so.1 will find libmklml_intel.so and
+        # libiomp5.so there.
+        command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
+        if os.system(command) != 0:
+            raise Exception("patch libdnnl.so failed, command: %s" % command)
+    shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
+    if os.name != 'nt':
+        shutil.copy('${MKLDNN_SHARED_LIB_1}', libs_path)
+        package_data['paddle.libs'] += ['libmkldnn.so.0', 'libdnnl.so.1']
+    else:
+        package_data['paddle.libs'] += ['mkldnn.dll']
+
+if '${WITH_XPU}' == 'ON':
+    # only change rpath in Release mode.
+    if '${CMAKE_BUILD_TYPE}' == 'Release':
+        if os.name != 'nt':
+            if "@APPLE@" == "1":
+                command = "install_name_tool -id \"@loader_path/\" ${XPU_API_LIB}"
+            else:
+                command = "patchelf --set-rpath '$ORIGIN/' ${XPU_API_LIB}"
+            if os.system(command) != 0:
+                raise Exception("patch ${XPU_API_LIB} failed, command: %s" %
+                                command)
+    shutil.copy('${XPU_API_LIB}', libs_path)
+    shutil.copy('${XPU_RT_LIB}', libs_path)
+    shutil.copy('${XPU_SIM_LIB}', libs_path)
+    package_data['paddle.libs'] += [
+        '${XPU_API_LIB_NAME}', '${XPU_RT_LIB_NAME}', '${XPU_SIM_LIB_NAME}'
+    ]
+
+# copy libpaddle_framework.so to libs
+if os.name != 'nt' and sys.platform != 'darwin':
+    paddle_framework_lib = '${FLUID_FRAMEWORK_SHARED_LIB}'
+    shutil.copy(paddle_framework_lib, libs_path)
+    package_data['paddle.libs'] += [
+        ('libpaddle_framework'
+         if os.name != 'nt' else 'paddle_framework') + ext_name
+    ]
+
+# remove unused paddle/libs/__init__.py
+if os.path.isfile(libs_path + '/__init__.py'):
+    os.remove(libs_path + '/__init__.py')
+package_dir['paddle.libs'] = libs_path
+
+# change rpath of ${FLUID_CORE_NAME}.ext and add $ORIGIN/../libs/ to it.
+# The reason is that libwarpctc.ext, libiomp5.ext etc. are in paddle.libs while
+# ${FLUID_CORE_NAME}.ext is in paddle.fluid, so paddle/fluid/../libs will point
+# to those libraries.
+# This operation fixes https://github.com/PaddlePaddle/Paddle/issues/3213
+if '${CMAKE_BUILD_TYPE}' == 'Release':
+    if os.name != 'nt':
+        # only change rpath in Release mode, since in Debug mode
+        # ${FLUID_CORE_NAME}.xx is too large to be changed.
+        if "@APPLE@" == "1":
+            command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
+        else:
+            command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
+        # The dynamic library built for aarch64 is larger than 64 MB, and
+        # patchelf reports an oversize error when patching it, so skip it there.
+        if platform.machine() != 'aarch64':
+            if os.system(command) != 0:
+                raise Exception(
+                    "patch ${FLUID_CORE_NAME}.%s failed, command: %s" %
+                    (ext_name, command))
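Since the rpath edits above run through shell commands, a quick check that they took effect can be useful. A sketch, assuming patchelf is on PATH and the script is run from the libs directory next to the copied library:

# Sketch: confirm the rpath that patchelf wrote (Linux only; patchelf assumed installed).
import subprocess

rpath = subprocess.check_output(
    ['patchelf', '--print-rpath', 'libdnnl.so.1']).decode().strip()
assert rpath == '$ORIGIN/', rpath  # the value set by the command above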
+
+ext_modules = [Extension('_foo', ['stub.cc'])]
+if os.name == 'nt':
+    # fix the path separator under windows
+    fix_package_dir = {}
+    for k, v in package_dir.items():
+        fix_package_dir[k] = v.replace('/', '\\')
+    package_dir = fix_package_dir
+    ext_modules = []
+elif sys.platform == 'darwin':
+    ext_modules = []
+
+
+def find_files(pattern, root):
+    for dirpath, _, files in os.walk(root):
+        for filename in fnmatch.filter(files, pattern):
+            yield os.path.join(dirpath, filename)
+
+
+headers = (
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/framework')) +
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/imperative')) +
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/memory')) +
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/platform')) +
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) +
+    list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) +
+    list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) +
+    ['${EIGEN_INCLUDE_DIR}/Eigen/Core'] +  # eigen
+    list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) +  # eigen
+    list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) +  # eigen
+    list(find_files('*', '${GFLAGS_INSTALL_DIR}/include')) +  # gflags
+    list(find_files('*', '${GLOG_INSTALL_DIR}/include')) +  # glog
+    list(find_files('*', '${BOOST_INCLUDE_DIR}/boost')) +  # boost
+    list(find_files('*', '${XXHASH_INSTALL_DIR}/include')) +  # xxhash
+    list(find_files('*', '${PROTOBUF_INCLUDE_DIR}')) +  # protobuf
+    list(find_files('*', '${DLPACK_INCLUDE_DIR}')) +  # dlpack
+    list(find_files('*.h', '${THREADPOOL_INCLUDE_DIR}')))  # threadpool
+
+if '${WITH_MKLDNN}' == 'ON':
+    headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include'))  # mkldnn
+
+if '${WITH_GPU}' == 'ON':
+    headers += list(find_files(
+        '*.pb', '${cudaerror_INCLUDE_DIR}'))  # errorMessage.pb for errormessage
+
+
+class InstallCommand(InstallCommandBase):
+    def finalize_options(self):
+        ret = InstallCommandBase.finalize_options(self)
+        self.install_headers = os.path.join(self.install_purelib, 'paddle',
+                                            'include')
+        self.install_lib = self.install_platlib
+        return ret
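find_files is a plain os.walk/fnmatch generator, so the headers list is just a flat list of absolute paths. A quick illustration with a hypothetical checkout root:

# Illustration only (hypothetical root path): list every framework header.
for path in find_files('*.h', '/paddle/paddle/fluid/framework'):
    print(path)  # e.g. /paddle/paddle/fluid/framework/device_worker.h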
+ """ + description = 'install C/C++ header files' + + user_options = [ + ('install-dir=', 'd', 'directory to install header files to'), + ('force', 'f', 'force installation (overwrite existing files)'), + ] + + boolean_options = ['force'] + + def initialize_options(self): + self.install_dir = None + self.force = 0 + self.outfiles = [] + + def finalize_options(self): + self.set_undefined_options( + 'install', ('install_headers', 'install_dir'), ('force', 'force')) + + def mkdir_and_copy_file(self, header): + if 'pb.h' in header: + install_dir = re.sub('${PADDLE_BINARY_DIR}/', '', header) + elif 'third_party' not in header: + # framework + install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) + else: + # third_party + install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) + patterns = [ + 'eigen3/src/extern_eigen3', 'boost/src/extern_boost', + 'dlpack/src/extern_dlpack/include', 'install/protobuf/include', + 'install/gflags/include', 'install/glog/include', + 'install/xxhash/include', 'install/mkldnn/include', + 'threadpool/src/extern_threadpool' + ] + for pattern in patterns: + install_dir = re.sub(pattern, '', install_dir) + install_dir = os.path.join(self.install_dir, + os.path.dirname(install_dir)) + if not os.path.exists(install_dir): + self.mkpath(install_dir) + return self.copy_file(header, install_dir) + + def run(self): + # only copy third_party/cudaErrorMessage.pb for cudaErrorMessage on mac or windows + if os.name == 'nt' or sys.platform == 'darwin': + if '${WITH_GPU}' == 'ON': + self.mkdir_and_copy_file( + '${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb') + return + hdrs = self.distribution.headers + if not hdrs: + return + self.mkpath(self.install_dir) + for header in hdrs: + (out, _) = self.mkdir_and_copy_file(header) + self.outfiles.append(out) + + def get_inputs(self): + return self.distribution.headers or [] + + def get_outputs(self): + return self.outfiles + + +# we redirect setuptools log for non-windows +if sys.platform != 'win32': + + @contextmanager + def redirect_stdout(): + f_log = open('${SETUP_LOG_FILE}', 'w') + origin_stdout = sys.stdout + sys.stdout = f_log + yield + f_log = sys.stdout + sys.stdout = origin_stdout + f_log.close() +else: + + @contextmanager + def redirect_stdout(): + yield + + +if '${WITH_GPU}' == 'ON': + os.environ['PACKAGE_NAME'] = "paddlepaddle-gpu" +else: + os.environ['PACKAGE_NAME'] = "paddlepaddle" + +with redirect_stdout(): + setup( + name='${PACKAGE_NAME}', + version='${PADDLE_VERSION}', + description='Parallel Distributed Deep Learning', + install_requires=setup_requires, + packages=packages, + ext_modules=ext_modules, + package_data=package_data, + package_dir=package_dir, + scripts=paddle_bins, + distclass=BinaryDistribution, + headers=headers, + cmdclass={ + 'install_headers': InstallHeaders, + 'install': InstallCommand, + }, + entry_points={ + 'console_scripts': + ['fleetrun = paddle.distributed.fleet.launch:launch'] + }) + +# As there are a lot of files in purelib which causes many logs, +# we don't print them on the screen, and you can open `setup.py.log` +# for the full logs. +if os.path.exists('${SETUP_LOG_FILE}'): + os.system('grep -v "purelib" ${SETUP_LOG_FILE}') diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py new file mode 100644 index 0000000000000000000000000000000000000000..970f89551c579b1554db8f878c63dace0d715930 --- /dev/null +++ b/tools/get_pr_ut.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py
new file mode 100644
index 0000000000000000000000000000000000000000..970f89551c579b1554db8f878c63dace0d715930
--- /dev/null
+++ b/tools/get_pr_ut.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" For PRs that only modify unit tests, get the test cases affected by the pull request. """
+
+import os
+import json
+from github import Github
+
+PADDLE_ROOT = os.getenv('PADDLE_ROOT', '/paddle/')
+
+
+class PRChecker(object):
+    """ PR Checker. """
+
+    def __init__(self):
+        self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60)
+        self.repo = self.github.get_repo('PaddlePaddle/Paddle')
+        self.pr = None
+
+    def init(self):
+        """ Get pull request. """
+        pr_id = os.getenv('GIT_PR_ID')
+        if not pr_id:
+            print('No PR ID')
+            exit(0)
+        self.pr = self.repo.get_pull(int(pr_id))
+
+    def get_pr_files(self):
+        """ Get files changed in the pull request. """
+        page = 0
+        file_list = []
+        while True:
+            files = self.pr.get_files().get_page(page)
+            if not files:
+                break
+            for f in files:
+                file_list.append(PADDLE_ROOT + f.filename)
+            page += 1
+        return file_list
+
+    def get_pr_ut(self):
+        """ Get unit tests covering the files in the pull request. """
+        ut_list = []
+        file_ut_map = None
+        cmd = 'wget -q --no-check-certificate https://sys-p0.bj.bcebos.com/prec/file_ut.json'
+        os.system(cmd)
+        with open('file_ut.json') as jsonfile:
+            file_ut_map = json.load(jsonfile)
+        for f in self.get_pr_files():
+            if f not in file_ut_map:
+                return ''
+            if f.endswith('.h') or f.endswith('.cu'):
+                return ''
+            else:
+                ut_list.extend(file_ut_map.get(f))
+        ut_list = list(set(ut_list))
+        return ' '.join(ut_list)
+
+
+if __name__ == '__main__':
+    pr_checker = PRChecker()
+    pr_checker.init()
+    print(pr_checker.get_pr_ut())
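For reviewers who want to exercise the tool locally, a usage sketch; the token and PR id are hypothetical placeholders. An empty result appears to signal that no precise file-to-test mapping is available (a changed file is missing from file_ut.json, or is a .h/.cu file), in which case the caller should fall back to the full test suite:

# Sketch: driving PRChecker by hand (placeholder credentials, hypothetical PR number).
import os
os.environ['GITHUB_API_TOKEN'] = '<your-token>'  # hypothetical placeholder
os.environ['GIT_PR_ID'] = '12345'                # hypothetical PR number

from get_pr_ut import PRChecker

checker = PRChecker()
checker.init()
print(checker.get_pr_ut())  # space-separated unit test names, or '' if unmappable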