Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize/large_scale_kv_spped

c419044d · seiriosPlus · deceefba · fd7ab4e6 · c419044d · c419044d
109 changed file
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -441,6 +441,7 @@ class SectionWorker : public DeviceWorker {
    skip_vars_ = skip_vars;
  }
  static void ResetBatchId() { batch_id_ = 0; }
+  static void ResetThreadCompletedFlag() { threads_completed = false; }
  static std::atomic<int> cpu_id_;

--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
@@ -18,6 +18,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -225,3 +226,14 @@ REGISTER_PASS(conv_affine_channel_fuse_pass,
              paddle::framework::ir::ConvAffineChannelFusePass);
 REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
              paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
+REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("affine_channel", 0));
+REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("affine_channel", 0));
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -18,6 +18,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -372,3 +373,14 @@ REGISTER_PASS(depthwise_conv_bn_fuse_pass,
              paddle::framework::ir::DepthwiseConvBNFusePass);
 REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass,
              paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass);
+REGISTER_PASS_CAPABILITY(conv_bn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("batch_norm", 0));
+REGISTER_PASS_CAPABILITY(conv_eltwiseadd_bn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("batch_norm", 0));
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
@@ -11,9 +11,9 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
 #include <string>
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -116,3 +116,10 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const {
 REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
              paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
+REGISTER_PASS_CAPABILITY(conv_elementwise_add2_act_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("relu", 0)
+            .EQ("identity", 0));
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h"
 #include <string>
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -102,3 +103,10 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
 REGISTER_PASS(conv_elementwise_add_act_fuse_pass,
              paddle::framework::ir::ConvElementwiseAddActFusePass);
+REGISTER_PASS_CAPABILITY(conv_elementwise_add_act_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("relu", 0)
+            .EQ("identity", 0));
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <string>
 #include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h"
+#include <string>
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -89,3 +89,8 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const {
 REGISTER_PASS(conv_elementwise_add_fuse_pass,
              paddle::framework::ir::ConvElementwiseAddFusePass);
+REGISTER_PASS_CAPABILITY(conv_elementwise_add_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0));
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
@@ -19,6 +19,7 @@
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -334,3 +335,8 @@ void EmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const {
 REGISTER_PASS(embedding_eltwise_layernorm_fuse_pass,
              paddle::framework::ir::EmbeddingEltwiseLayerNormFusePass);
+REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("lookup_table", 0)
+            .EQ("elementweise_add", 0));
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc
@@ -16,12 +16,13 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
 namespace ir {
-TEST(SkipLayerNormFusePass, basic) {
+TEST(EmbeddingElewiseLayernormFusePass, basic) {
  // inputs                           operator            output
  // --------------------------------------------------------------------
  // (x, y)                       elementwise_add    -> elementwise_out
@@ -91,6 +92,12 @@ TEST(SkipLayerNormFusePass, basic) {
          "The number of fusion nodes does not meet expectations after fuse"));
 }
+TEST(EmbeddingElewiseLayernormFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("embedding_eltwise_layernorm_fuse_pass"));
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@@ -84,6 +85,19 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
      VLOG(3) << "do not perform " + type() + "+bias fuse";
      return;
    }
+    if (conv->Op()->HasAttr("dilations")) {
+      auto dilations =
+          BOOST_GET_CONST(std::vector<int>, conv->Op()->GetAttr("dilations"));
+      for (const auto& d : dilations) {
+        if (d != 1) {
+          LOG(WARNING)
+              << "dilation conv not supported in MKLDNN, fuse not apply "
+              << "and set conv attribute use_mkldnn = false";
+          conv->Op()->SetAttr("use_mkldnn", false);
+          return;
+        }
+      }
+    }
    auto* eltwise_bias_tensor =
        scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();
@@ -151,3 +165,8 @@ REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass,
              paddle::framework::ir::Conv2DTransposeBiasFusePass);
 REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass,
              paddle::framework::ir::Conv3DBiasFusePass);
+REGISTER_PASS_CAPABILITY(conv_bias_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0));
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
@@ -18,6 +18,7 @@
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/imperative/type_defs.h"
 namespace paddle {
@@ -149,6 +150,12 @@ TEST(ConvBiasFusePass, conv2d_transpose) {
  ASSERT_EQ(pass.type(), std::string("conv2d_transpose"));
 }
+TEST(ConvBiasFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("conv_bias_mkldnn_fuse_pass"));
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
@@ -19,6 +19,7 @@
 #include <memory>
 #include <tuple>
 #include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -341,3 +342,8 @@ void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
 REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass,
              paddle::framework::ir::ResidualConnectionMKLDNNFusePass);
+REGISTER_PASS_CAPABILITY(conv_elementwise_add_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0));
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -267,6 +268,12 @@ TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) {
  AssertOpsCount(graph, 2, 1);
 }
+TEST(ConvElementwiseAddMKLDNNFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("conv_elementwise_add_mkldnn_fuse_pass"));
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -57,3 +58,7 @@ void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
 REGISTER_PASS(depthwise_conv_mkldnn_pass,
              paddle::framework::ir::DepthwiseConvMKLDNNPass);
+REGISTER_PASS_CAPABILITY(depthwise_conv_mkldnn_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination().EQ(
+            "depthwise_conv2d", 0));
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
@@ -16,6 +16,8 @@
 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -70,6 +72,12 @@ ProgramDesc BuildProgramDesc() {
  return prog;
 }
+TEST(DepthwiseConvMKLDNNPass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("depthwise_conv_mkldnn_pass"));
+}
 TEST(DepthwiseConvMKLDNNPass, basic) {
  auto prog = BuildProgramDesc();

--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -19,6 +19,7 @@
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/errors.h"
 namespace paddle {
@@ -707,3 +708,13 @@ REGISTER_PASS(multihead_matmul_fuse_pass,
 REGISTER_PASS(multihead_matmul_fuse_pass_v2,
              paddle::framework::ir::MultiHeadMatmulV2FusePass);
+REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("reshape2", 0)
+            .EQ("transpose2", 0)
+            .EQ("scale", 0)
+            .EQ("matmul", 0)
+            .EQ("softmax", 0));
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h"  // NOLINT
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -133,6 +134,12 @@ TEST(MultiHeadMatmulFusePass, basic) {
                        num_fused_nodes_after));
 }
+TEST(MultiHeadMatmulFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("multihead_matmul_fuse_pass_v2"));
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #define MAX_NUM_FC 10
@@ -174,6 +175,10 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern,
            if (x->outputs.size() <= 0 || x->inputs.size() <= 0U) {
              return false;
            }
+            if (x->IsVar() && x->Var() && x->Var()->GetShape().size() > 2) {
+              LOG(WARNING) << "repeated fc relu only supports input dims = 2";
+              return false;
+            }
            int fc_idx = FindFCIdx(x);
            if (fc_idx < 0) {
              return false;
@@ -384,3 +389,8 @@ void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const {
 REGISTER_PASS(repeated_fc_relu_fuse_pass,
              paddle::framework::ir::RepeatedFCReluFusePass);
+REGISTER_PASS_CAPABILITY(repeated_fc_relu_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("fc", 0)
+            .EQ("relu", 0));
--- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
+++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
@@ -16,6 +16,7 @@
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -34,6 +35,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
  const std::string pattern_name = "shufflechannel_pattern";
  FusePassBase::Init(pattern_name, graph);
+  LOG(WARNING) << "There is fluid.layers.shuffle_channel API already, you can "
+                  "use it instead of (reshape + transpose +reshape)";
  GraphPatternDetector gpd;
  auto* x = gpd.mutable_pattern()
                ->NewNode("x")
@@ -93,3 +96,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
 REGISTER_PASS(shuffle_channel_detect_pass,
              paddle::framework::ir::ShuffleChannelDetectPass);
+REGISTER_PASS_CAPABILITY(shuffle_channel_detect_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("reshape2", 0)
+            .EQ("transpose2", 0));
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -180,3 +181,8 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
 REGISTER_PASS(skip_layernorm_fuse_pass,
              paddle::framework::ir::SkipLayerNormFusePass);
+REGISTER_PASS_CAPABILITY(skip_layernorm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("elementwise_add", 0)
+            .EQ("layer_norm", 0));
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -54,6 +55,12 @@ TEST(SkipLayerNormFusePass, basic) {
          "The number of fusion nodes does not meet expectations after fuse"));
 }
+TEST(SkipLayerNormFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("skip_layernorm_fuse_pass"));
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -251,6 +251,7 @@ void PipelineTrainer::Finalize() {
  }
  root_scope_->DropKids();
  SectionWorker::ResetBatchId();
+  SectionWorker::ResetThreadCompletedFlag();
 }
 Scope* PipelineTrainer::GetWorkerScope(int thread_id) {

--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -196,7 +196,6 @@ void SectionWorker::TrainFiles() {
        if (threads_completed) {
          VLOG(3) << "thread " << thread_id_ << " completed.";
          lk.unlock();
-          threads_completed = false;
          return;
        }
        lk.unlock();
@@ -459,7 +458,6 @@ void SectionWorker::TrainFilesWithProfiler() {
                    << ", mean_time: " << op_total_time[i] / op_count[i];
          }
          VLOG(0) << "================================";
-          threads_completed = false;
          return;
        }
        lk.unlock();

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -9,8 +9,8 @@ if(WITH_GPU AND TENSORRT_FOUND)
 endif()
 function(download_data install_dir data_file)
-    string(REGEX MATCH "[^/\\]+$" data_file ${data_file})
+    string(REGEX MATCH "[^/\\]+$" file_name ${data_file})
-    if (NOT EXISTS ${install_dir}/${data_file})
+    if (NOT EXISTS ${install_dir}/${file_name})
        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file})
    endif()
 endfunction()

--- a/paddle/fluid/operators/average_accumulates_op.h
+++ b/paddle/fluid/operators/average_accumulates_op.h
@@ -54,9 +54,13 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
    float average_window = ctx.Attr<float>("average_window");
    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
-    PADDLE_ENFORCE_LE(min_average_window, max_average_window,
+    PADDLE_ENFORCE_LE(
-                      "min_average_window shouldn't be larger than "
+        min_average_window, max_average_window,
-                      "max_average_window");
+        platform::errors::InvalidArgument(
+            "The min_average_window > "
+            "max_average_window is not right, min_average_window is %ld, "
+            "max_average_window is %ld.",
+            min_average_window, max_average_window));
    // Get inputs
    auto* param = ctx.Input<Tensor>("param");

--- a/paddle/fluid/operators/cudnn_lstm_cache.h
+++ b/paddle/fluid/operators/cudnn_lstm_cache.h
@@ -54,6 +54,8 @@ class ScopedRNNBase {
      x_descs_.emplace_back(x_desc_.descriptor<T>(dims_x, strides_x));
      y_descs_.emplace_back(y_desc_.descriptor<T>(dims_y, strides_y));
    }
+#if CUDNN_VERSION >= 7201
    if (!sequence_length.empty()) {
      x_seq_desc_.descriptor<T>(seq_length_, batch_size_, input_size_, true,
                                sequence_length);
@@ -61,6 +63,7 @@ class ScopedRNNBase {
                                hidden_size_ * numDirections, true,
                                sequence_length);
    }
+#endif
    // ------------------- cudnn hx, hy, cx, cy descriptors----------
    std::vector<int> dims_hx = {num_layers_ * numDirections, batch_size_,
@@ -96,10 +99,13 @@ class ScopedRNNBase {
        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
        cudnn_type));
 #endif
+#if CUDNN_VERSION >= 7201
    if (!sequence_length.empty()) {
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode(
          rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED));
    }
+#endif
    // ------------------- cudnn weights_size ---------------------
    size_t weights_size_;
@@ -125,8 +131,10 @@ class ScopedRNNBase {
  }
  cudnnTensorDescriptor_t* x_descs() { return x_descs_.data(); }
  cudnnTensorDescriptor_t* y_descs() { return y_descs_.data(); }
+#if CUDNN_VERSION >= 7201
  cudnnRNNDataDescriptor_t x_seq_desc() { return x_seq_desc_.desc(); }
  cudnnRNNDataDescriptor_t y_seq_desc() { return y_seq_desc_.desc(); }
+#endif
  cudnnTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); }
  cudnnTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); }
  cudnnTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); }
@@ -151,8 +159,10 @@ class ScopedRNNBase {
  platform::ScopedTensorDescriptor x_desc_;
  platform::ScopedTensorDescriptor y_desc_;
+#if CUDNN_VERSION >= 7201
  platform::ScopedRNNTensorDescriptor x_seq_desc_;
  platform::ScopedRNNTensorDescriptor y_seq_desc_;
+#endif
  platform::ScopedTensorDescriptor init_h_desc_;
  platform::ScopedTensorDescriptor init_c_desc_;
  platform::ScopedTensorDescriptor last_h_desc_;

--- a/paddle/fluid/operators/distributed_ops/fake_init_op.cc
+++ b/paddle/fluid/operators/distributed_ops/fake_init_op.cc
@@ -43,9 +43,9 @@ class FakeInitOp : public framework::OperatorBase {
      tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
    } else {
-      PADDLE_THROW(
+      PADDLE_THROW(platform::errors::InvalidArgument(
          "fake init op's output only"
-          "supports SelectedRows and LoDTensor");
+          "supports SelectedRows and LoDTensor"));
    }
  }
 };

--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -134,7 +134,10 @@ void ListenAndServOp::RunSyncLoop(
  auto optimize_blocks =
      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
  PADDLE_ENFORCE_GE(num_blocks, 2,
-                    "server program should have at least 2 blocks");
+                    platform::errors::PreconditionNotMet(
+                        "Invalid number of blocks in server program. Expected "
+                        "equal or greater than 2. Recieved %zu",
+                        num_blocks));
  // Prepare all the server block
  std::vector<int> optimize_blocks_list;
@@ -218,7 +221,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
      VLOG(3) << "reset sparse var: " << varname;
      var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
    } else {
-      PADDLE_THROW("The type of sparse var should be SelectedRows");
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "The type of sparse var should be SelectedRows"));
    }
  }
  if (UNLIKELY(reset_all)) {
@@ -235,7 +239,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
        math::set_constant(*dev_ctx, var->GetMutable<framework::Tensor>(),
                           static_cast<float>(0));
      } else {
-        PADDLE_THROW("The type of dense var should be in [LoDTensor, Tensor]");
+        PADDLE_THROW(platform::errors::PreconditionNotMet(
+            "The type of dense var should be in [LoDTensor, Tensor]"));
      }
    }
  }
@@ -254,8 +259,15 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
    std::vector<std::string> pieces;
    split(grad_and_id, ':', &pieces);
    VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
-    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+    PADDLE_ENFORCE_EQ(pieces.size(), 2,
-    PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0);
+                      platform::errors::PreconditionNotMet(
+                          "Invalid format of grad_and_id argument. "
+                          "Expected \"grad:block_id\". Recieved %s",
+                          grad_and_id.c_str()));
+    PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0,
+                      platform::errors::AlreadyExists(
+                          "The gradient name %s has already existed in out_map",
+                          pieces[0].c_str()));
    int block_id = std::stoi(pieces[1]);
    (*out_map)[pieces[0]] = block_id;
@@ -267,7 +279,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
  size_t num_blocks = program->Size();
  PADDLE_ENFORCE_GE(num_blocks, 2,
-                    "server program should have at least 2 blocks");
+                    platform::errors::PreconditionNotMet(
+                        "Invalid number of blocks in server program. Expected "
+                        "equal or greater than 2. Recieved %zu",
+                        num_blocks));
  std::vector<int> block_list;
  for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
    block_list.push_back(blkid);
@@ -342,9 +357,9 @@ void ListenAndServOp::CacheVarsType(const std::vector<std::string> &varnames,
               var->IsType<framework::Tensor>()) {
      dense_vars_.push_back(varname);
    } else {
-      PADDLE_THROW(
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
          "The type of received var should be in [SelectedRows, LoDTensor, "
-          "Tensor].");
+          "Tensor]."));
    }
  }
 }
@@ -450,7 +465,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
    split(prefetch_var_name_and_id, ':', &pieces);
    VLOG(3) << "after split, prefetch_var = " << pieces[0]
            << ", id=" << pieces[1];
-    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+    PADDLE_ENFORCE_EQ(
+        pieces.size(), 2,
+        platform::errors::PreconditionNotMet(
+            "Invalid format of prefetch_var_name_and_id argument. "
+            "Expected \"xxx:xxx\". Recieved %s",
+            prefetch_var_name_and_id.c_str()));
    int block_id = std::stoi(pieces[1]);
    prefetch_block_id_list.push_back(block_id);
@@ -476,7 +496,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
       sparse_grad_name_to_param_name_str) {
    std::vector<std::string> pieces;
    split(sparse_grad_name_and_param_name, ':', &pieces);
-    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+    PADDLE_ENFORCE_EQ(
+        pieces.size(), 2,
+        platform::errors::PreconditionNotMet(
+            "Invalid format of sparse_grad_name_and_param_name argument. "
+            "Expected \"xxx:xxx\". Recieved %s",
+            sparse_grad_name_and_param_name.c_str()));
    VLOG(3) << "after split, sparse_grad_name = " << pieces[0]
            << ", param_name = " << pieces[1];
    sparse_grad_name_to_param_name[pieces[0]] = pieces[1];

--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
@@ -61,8 +61,15 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx,
                           const framework::Tensor *x,
                           const framework::Tensor *y, framework::Tensor *z) {
  int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
+  auto x_dims = x->dims();
-      ctx, x, y, axis, FloorDivFunctor<T>(), z);
+  auto y_dims = y->dims();
+  if (x_dims.size() >= y_dims.size()) {
+    ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, FloorDivFunctor<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseFloorDivFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseFloorDivFunctor<T>(), z);
+  }
 }
 template <typename DeviceContext, typename T>

--- a/paddle/fluid/operators/empty_op.cc
+++ b/paddle/fluid/operators/empty_op.cc
@@ -55,31 +55,38 @@ class EmptyOp : public framework::OperatorWithKernel {
    OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty");
    if (context->HasInput("ShapeTensor")) {
-      auto dims = context->GetInputDim("ShapeTensor");
+      auto shape_dims = context->GetInputDim("ShapeTensor");
      int num_ele = 1;
-      for (int i = 0; i < dims.size(); ++i) {
+      for (int i = 0; i < shape_dims.size(); ++i) {
-        num_ele *= dims[i];
+        num_ele *= shape_dims[i];
      }
+      auto vec_dims = std::vector<int>(num_ele, -1);
-      context->SetOutputDim("Out", framework::make_ddim({num_ele}));
+      context->SetOutputDim("Out", framework::make_ddim(vec_dims));
    } else if (context->HasInputs("ShapeTensorList")) {
      std::vector<int> out_dims;
      auto dims_list = context->GetInputsDim("ShapeTensorList");
      for (size_t i = 0; i < dims_list.size(); ++i) {
        auto& dims = dims_list[i];
-        PADDLE_ENFORCE_EQ(
+        PADDLE_ENFORCE_EQ(dims, framework::make_ddim({1}),
-            dims, framework::make_ddim({1}),
+                          platform::errors::InvalidArgument(
-            "ShapeError: The shape of Tensor in list must be [1]. "
+                              "The shape of Tensor in list must be [1]. "
-            "But received the shape "
+                              "But received the shape is [%s]",
-            "is [%s]",
+                              dims));
-            dims);
+        out_dims.push_back(-1);
-        out_dims.push_back(dims[0]);
      }
      context->SetOutputDim("Out", framework::make_ddim(out_dims));
    } else {
      auto& shape = context->Attrs().Get<std::vector<int64_t>>("shape");
+      for (size_t i = 0; i < shape.size(); ++i) {
+        PADDLE_ENFORCE_GE(
+            shape[i], 0,
+            platform::errors::InvalidArgument(
+                "Each value of attribute 'shape' is expected to be no less "
+                "than 0. But recieved: shape[%u] = %d; shape = [%s].",
+                i, shape[i], framework::make_ddim(shape)));
+      }
      context->SetOutputDim("Out", framework::make_ddim(shape));
    }
  }

--- a/paddle/fluid/operators/math/beam_search.cc
+++ b/paddle/fluid/operators/math/beam_search.cc
@@ -87,7 +87,10 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
    lod[0].assign(high_level.begin(), high_level.end());
    lod[1].assign(low_level.begin(), low_level.end());
    if (!framework::CheckLoD(lod)) {
-      PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "lod %s is not right in"
+          " beam_search, please check your code.",
+          framework::LoDToString(lod)));
    }
    selected_ids->set_lod(lod);
    selected_scores->set_lod(lod);

--- a/paddle/fluid/operators/math/beam_search.cu
+++ b/paddle/fluid/operators/math/beam_search.cu
@@ -400,7 +400,10 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
    context.Wait();
    if (!framework::CheckLoD(selected_lod)) {
-      PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod));
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "lod %s is not right in"
+          " beam_search, please check your code.",
+          framework::LoDToString(selected_lod)));
    }
    selected_ids->set_lod(selected_lod);

--- a/paddle/fluid/operators/math/blas.cc
+++ b/paddle/fluid/operators/math/blas.cc
@@ -20,7 +20,11 @@ namespace operators {
 namespace math {
 MatDescriptor CreateMatrixDescriptor(const framework::DDim &tensor_dim,
                                     int num_flatten_cols, bool trans) {
-  PADDLE_ENFORCE_GT(tensor_dim.size(), 1);
+  PADDLE_ENFORCE_GT(
+      tensor_dim.size(), 1,
+      platform::errors::InvalidArgument("The tensor dim size should be greater "
+                                        "than 1, but reveived dim size is %d",
+                                        tensor_dim.size()));
  MatDescriptor retv;
  if (num_flatten_cols > 1) {
    auto flatten_dim = framework::flatten_to_2d(tensor_dim, num_flatten_cols);

--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -60,7 +60,8 @@ struct CUBlas<float> {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cublasSgemmStridedBatched(args...));
 #else
-    PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "SgemmStridedBatched is not supported on cuda <= 7.5"));
 #endif
  }
@@ -85,7 +86,8 @@ struct CUBlas<float> {
          beta, C, Ctype, ldc));
    });
 #else
-    PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "cublasSgemmEx is not supported on cuda <= 7.5"));
 #endif
  }
@@ -146,13 +148,15 @@ struct CUBlas<double> {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cublasDgemmStridedBatched(args...));
 #else
-    PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "DgemmStridedBatched is not supported on cuda <= 7.5"));
 #endif
  }
  template <typename... ARGS>
  static void GEMM_EX(ARGS... args) {
-    PADDLE_THROW("Currently there are not cublasDgemmEx.");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Currently there are not cublasDgemmEx."));
  }
  template <typename... ARGS>
@@ -216,7 +220,8 @@ struct CUBlas<platform::float16> {
        reinterpret_cast<const __half *>(beta), reinterpret_cast<__half *>(C),
        ldc, strideC, batchCount));
 #else
-    PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "HgemmStridedBatched is not supported on cuda <= 7.5"));
 #endif
  }
@@ -247,7 +252,8 @@ struct CUBlas<platform::float16> {
          beta, C, Ctype, ldc, computeType, algo));
    });
 #else
-    PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "cublasGemmEx is not supported on cuda <= 7.5"));
 #endif
  }
 };
@@ -302,8 +308,12 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
+  PADDLE_ENFORCE_GE(
-                    "cublas fp16 gemm requires GPU compute capability >= 53");
+      context_.GetComputeCapability(), 53,
+      platform::errors::InvalidArgument(
+          "cublas fp16 gemm requires GPU compute capability >= 53,"
+          "but received %d",
+          context_.GetComputeCapability()));
  float h_alpha = static_cast<float>(alpha);
  float h_beta = static_cast<float>(beta);

--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -29,7 +29,8 @@ template <>
 struct CBlas<int8_t> {
  template <typename... ARGS>
  static void VCOPY(ARGS... args) {
-    PADDLE_THROW("Blas VCOPY don't support int8_t");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Blas VCOPY do not supported on CPU, please check your code"));
  }
 };
@@ -347,22 +348,47 @@ struct CBlas<double> {
 template <>
 struct CBlas<platform::float16> {
-  static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
+  static void GEMM(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 GEMM not supported on CPU, please check your code"));
+  }
  static void SMM_GEMM(...) {
-    PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 SMM_GEMM not supported on CPU, please check your code"));
  }
-  static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
+  static void VMUL(...) {
-  static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
+    PADDLE_THROW(platform::errors::Unimplemented(
-  static void VSQUARE(...) {
+        "float16 VMUL not supported on CPU, please check your code"));
-    PADDLE_THROW("float16 VSQUARE not supported on CPU");
  }
-  static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
+  static void VEXP(...) {
-  static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
+    PADDLE_THROW(platform::errors::Unimplemented(
-  static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
+        "float16 VEXP not supported on CPU, please check your code"));
-  static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); };
+  }
+  static void VSQUARE(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 VSQUARE not supported on CPU, please check your code"));
+  }
+  static void VPOW(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 VPOW not supported on CPU, please check your code"));
+  }
+  static void DOT(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 DOT not supported on CPU, please check your code"));
+  };
+  static void SCAL(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 SCAL not supported on CPU, please check your code"));
+  };
+  static void ASUM(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 ASUM not supported on CPU, please check your code"));
+  };
 #ifdef PADDLE_WITH_MKLML
  static void GEMM_BATCH(...) {
-    PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 GEMM_BATCH not supported on CPU, please check your code"));
  }
 #endif
 };
@@ -446,11 +472,18 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a, bool trans_a,
  auto dim_a = mat_a.dims();
  auto dim_b = mat_b.dims();
  auto dim_out = mat_out->dims();
-  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
+  PADDLE_ENFORCE_EQ(
-                 "The input and output of matmul be matrix");
+      dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, true,
-  PADDLE_ENFORCE(
+      platform::errors::InvalidArgument(
-      mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(),
+          "The input and output of matmul should be matrix, the dim size must "
-      "The places of matrices must be same");
+          "be 2,"
+          "but received dim size input_a:%d, input_b:%d, output:%d",
+          dim_a.size(), dim_b.size(), dim_out.size()));
+  PADDLE_ENFORCE_EQ(
+      mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(), true,
+      platform::errors::InvalidArgument("The places of matrices in the matmul "
+                                        "should be same, please check your "
+                                        "code."));
  int M = dim_out[0];
  int N = dim_out[1];
@@ -715,7 +748,13 @@ void Blas<platform::CPUDeviceContext>::BatchedGEMMWithHead(
    }
  } else {
-    PADDLE_ENFORCE_EQ(W1, H2);
+    PADDLE_ENFORCE_EQ(
+        W1, H2,
+        platform::errors::InvalidArgument(
+            "The fisrt matrix width should be same as second matrix height,"
+            "but received fisrt matrix width %d"
+            ", second matrix height %d",
+            W1, H2));
    int ldc = W2 * head_number;
    int sub_width = W1 / head_number;
@@ -785,7 +824,14 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
                                 const framework::Tensor &mat_b,
                                 const MatDescriptor &dim_b, T alpha,
                                 framework::Tensor *mat_out, T beta) const {
-  PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_);
+  PADDLE_ENFORCE_EQ(
+      dim_a.width_, dim_b.height_,
+      platform::errors::InvalidArgument(
+          "The fisrt matrix width should be same as second matrix height,"
+          "but received fisrt matrix width %d"
+          ", second matrix height %d",
+          dim_a.width_, dim_b.height_));
  CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
  CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
  if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
@@ -793,12 +839,14 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
                           dim_a.width_, alpha, mat_a.data<T>(),
                           mat_b.data<T>(), beta, mat_out->data<T>());
  } else {
-    PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
+    PADDLE_ENFORCE_EQ(
-                       dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0,
+        dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 ||
-                   "dim_a.batch_size should be equal to dim_b.batch_size, or "
+            dim_b.batch_size_ == 0,
-                   "one of dim_a.batch_size and dim_b.batch_size should be 0. "
+        true, platform::errors::InvalidArgument(
-                   "But got dim_a.batch_size = %d, dim_b.batch_size = %d.",
+                  "dim_a.batch_size should be equal to dim_b.batch_size, or "
-                   dim_a.batch_size_, dim_b.batch_size_);
+                  "one of dim_a.batch_size and dim_b.batch_size should be 0. "
+                  "But got dim_a.batch_size = %d, dim_b.batch_size = %d.",
+                  dim_a.batch_size_, dim_b.batch_size_));
    this->template BatchedGEMM<T>(
        transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha,
        mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(),
@@ -834,15 +882,42 @@ void Blas<DeviceContext>::MatMulWithHead(const framework::Tensor &mat_a,
                                         int head_number,
                                         framework::Tensor *mat_out, T beta,
                                         bool mat_b_split_vertical) const {
-  PADDLE_ENFORCE_EQ(dim_a.width_ % head_number, 0);
+  PADDLE_ENFORCE_EQ(
-  PADDLE_ENFORCE_GE(head_number, 1);
+      dim_a.width_ % head_number, 0,
-  PADDLE_ENFORCE_LE(head_number, dim_a.width_);
+      platform::errors::InvalidArgument(
+          "The first input width must be some times the head number"
+          "but received first input width %d"
+          ",  head_number %d",
+          dim_a.width_, head_number));
+  PADDLE_ENFORCE_GE(head_number, 1,
+                    platform::errors::InvalidArgument(
+                        "The head number should be greater equal 1,"
+                        "but received head number %d",
+                        head_number));
+  PADDLE_ENFORCE_LE(
+      head_number, dim_a.width_,
+      platform::errors::InvalidArgument(
+          "The head number should be less equal first input width,"
+          "but received first input width %d"
+          ",  head_number %d",
+          dim_a.width_, head_number));
  CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
  CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
  if (mat_b_split_vertical) {
-    PADDLE_ENFORCE_EQ(dim_b.height_, dim_a.width_ / head_number);
+    PADDLE_ENFORCE_EQ(
-    PADDLE_ENFORCE_EQ(dim_b.width_ % head_number, 0);
+        dim_b.height_, dim_a.width_ / head_number,
+        platform::errors::InvalidArgument(
+            "The second input height should be equal than first input width,"
+            "but received second input height %d, first input width %d",
+            dim_b.height_, dim_a.width_ / head_number));
+    PADDLE_ENFORCE_EQ(
+        dim_a.width_ % head_number, 0,
+        platform::errors::InvalidArgument(
+            "The second input width should be some times the head number"
+            "but received second input width %d"
+            ",  head_number %d",
+            dim_b.width_, head_number));
  }
  if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
@@ -888,9 +963,16 @@ void Blas<DeviceContext>::MatMulWithHead(const framework::Tensor &mat_a,
                             mat_out->data<T>() + sub_matC_offset, ldc);
    }
  } else {
-    PADDLE_ENFORCE_EQ((dim_a.batch_size_ == dim_b.batch_size_ ||
+    PADDLE_ENFORCE_EQ(
-                       dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0),
+        (dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 ||
-                      true);
+         dim_b.batch_size_ == 0),
+        true,
+        platform::errors::InvalidArgument(
+            "The first input batch size should be equal than second input,"
+            "either two input batch size is 0, but received first input batch "
+            "size"
+            " %d, second input batch size %d",
+            dim_a.batch_size_, dim_b.batch_size_));
    this->template BatchedGEMMWithHead<T>(
        transA, transB, dim_a.width_, dim_a.height_, dim_b.width_,

--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -22,10 +22,12 @@ limitations under the License. */
 #include <cblas.h>
 #endif
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/operators/math/math_function_impl.h"
 #include "paddle/fluid/platform/float16.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 namespace paddle {
 namespace operators {
@@ -63,6 +65,55 @@ DEFINE_CPU_TRANS(4);
 DEFINE_CPU_TRANS(5);
 DEFINE_CPU_TRANS(6);
+template <typename T>
+struct TransposeNormal<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& in, framework::Tensor* out,
+                  const std::vector<int>& axis) {
+    const int rank = axis.size();
+    auto in_stride = framework::stride(in.dims());
+    auto out_stride = framework::stride(out->dims());
+    const T* in_ptr = in.data<T>();
+    T* out_ptr = out->data<T>();
+    auto transpose_helper = [&](int64_t beg, int64_t end) {
+      for (int64_t out_idx = beg; out_idx < end; ++out_idx) {
+        int64_t in_idx = 0;
+        int64_t tmp_idx = out_idx;
+        // calculate the input index
+        for (int i = 0; i < rank; ++i) {
+          const int64_t coordinate = tmp_idx / out_stride[i];
+          tmp_idx -= coordinate * out_stride[i];
+          in_idx += coordinate * in_stride[axis[i]];
+        }
+        out_ptr[out_idx] = in_ptr[in_idx];
+      }
+    };
+    double cost_per_iteration =
+        rank * (Eigen::TensorOpCost::DivCost<int64_t>() +
+                2 * Eigen::TensorOpCost::MulCost<int64_t>() +
+                2 * Eigen::TensorOpCost::AddCost<int64_t>());
+    Eigen::TensorOpCost cost(sizeof(T), sizeof(T), cost_per_iteration);
+    auto* cpu_device = context.eigen_pool_device();
+    cpu_device->parallelFor(out->numel(), cost, std::move(transpose_helper));
+  }
+};
+// define transpose normal
+#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
+  template struct TransposeNormal<platform::CPUDeviceContext, TYPE>
+DEFINE_CPU_TRANS_NORMAL(platform::float16);
+DEFINE_CPU_TRANS_NORMAL(platform::bfloat16);
+DEFINE_CPU_TRANS_NORMAL(float);
+DEFINE_CPU_TRANS_NORMAL(double);
+DEFINE_CPU_TRANS_NORMAL(int);
+DEFINE_CPU_TRANS_NORMAL(int64_t);
+DEFINE_CPU_TRANS_NORMAL(bool);
+DEFINE_CPU_TRANS_NORMAL(int16_t);
+DEFINE_CPU_TRANS_NORMAL(uint8_t);
+DEFINE_CPU_TRANS_NORMAL(int8_t);
 struct TensorSetConstantCPU {
  TensorSetConstantCPU(framework::Tensor* tensor, float value)
      : tensor_(tensor), value_(value) {}

--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -11,8 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <algorithm>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/math_function_impl.h"
@@ -23,6 +26,7 @@ namespace operators {
 namespace math {
 using float16 = paddle::platform::float16;
+using bfloat16 = paddle::platform::bfloat16;
 template struct SetConstant<platform::CUDADeviceContext, platform::float16>;
 template struct SetConstant<platform::CUDADeviceContext, float>;
@@ -31,12 +35,13 @@ template struct SetConstant<platform::CUDADeviceContext, int>;
 template struct SetConstant<platform::CUDADeviceContext, int64_t>;
 template struct SetConstant<platform::CUDADeviceContext, bool>;
-#define DEFINE_GPU_TRANS(RANK)                                           \
+#define DEFINE_GPU_TRANS(RANK)                                            \
-  template struct Transpose<platform::CUDADeviceContext, float, RANK>;   \
+  template struct Transpose<platform::CUDADeviceContext, float, RANK>;    \
-  template struct Transpose<platform::CUDADeviceContext, double, RANK>;  \
+  template struct Transpose<platform::CUDADeviceContext, double, RANK>;   \
-  template struct Transpose<platform::CUDADeviceContext, float16, RANK>; \
+  template struct Transpose<platform::CUDADeviceContext, float16, RANK>;  \
-  template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;  \
+  template struct Transpose<platform::CUDADeviceContext, bfloat16, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext, int32_t, RANK>; \
+  template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;   \
+  template struct Transpose<platform::CUDADeviceContext, int32_t, RANK>;  \
  template struct Transpose<platform::CUDADeviceContext, int64_t, RANK>;
 DEFINE_GPU_TRANS(1);
@@ -46,6 +51,88 @@ DEFINE_GPU_TRANS(4);
 DEFINE_GPU_TRANS(5);
 DEFINE_GPU_TRANS(6);
+#define REINTERPRET(T, DST_PTR, SRC_PTR) \
+  T* DST_PTR = reinterpret_cast<T*>(SRC_PTR)
+template <typename T>
+__global__ void TransposeNormalKernel(const T* in_ptr, T* out_ptr,
+                                      int64_t element,
+                                      const int64_t* in_stride_ptr,
+                                      const int64_t* out_stride_ptr,
+                                      const int64_t* axis_ptr, int rank) {
+  CUDA_KERNEL_LOOP(out_idx, element) {
+    int64_t in_idx = 0;
+    int64_t tmp_idx = out_idx;
+    for (int i = 0; i < rank; ++i) {
+      const int64_t coordinate = tmp_idx / out_stride_ptr[i];
+      tmp_idx -= coordinate * out_stride_ptr[i];
+      in_idx += coordinate * in_stride_ptr[axis_ptr[i]];
+    }
+    out_ptr[out_idx] = in_ptr[in_idx];
+  }
+}
+template <typename T>
+struct TransposeNormal<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& in, framework::Tensor* out,
+                  const std::vector<int>& axis) {
+    const int rank = axis.size();
+    auto in_stride = framework::stride(in.dims());
+    auto out_stride = framework::stride(out->dims());
+    auto* in_ptr = in.data<T>();
+    auto* out_ptr = out->data<T>();
+    // copy in_stride, out_stride, axis to gpu device
+    const platform::CUDAPlace& cuda_place =
+        BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace());
+    platform::CPUPlace cpu_place = platform::CPUPlace();
+    size_t size = 3 * rank * sizeof(int64_t);
+    auto cpu_buf_holder = memory::AllocShared(cpu_place, size);
+    auto cuda_buf_holder = memory::AllocShared(cuda_place, size);
+    REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
+    REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
+    for (int i = 0; i < rank; ++i) {
+      cpu_buf[i] = in_stride[i];
+      cpu_buf[rank + i] = out_stride[i];
+      cpu_buf[2 * rank + i] = axis[i];
+    }
+    memory::Copy(cuda_place, cuda_buf, cpu_place, cpu_buf, size,
+                 context.stream());
+    REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
+    REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
+    REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank);
+    const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock();
+    const int MAX_GRID_DIM =
+        context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
+    int64_t elements = in.numel();
+    int block_size = (elements >= MAX_BLOCK_DIM)
+                         ? MAX_BLOCK_DIM
+                         : (1 << static_cast<int>(std::log2(elements)));
+    int grid_size = elements / block_size;
+    grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
+    TransposeNormalKernel<T><<<grid_size, block_size, 0, context.stream()>>>(
+        in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr,
+        rank);
+  }
+};
+// define transpose normal
+#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
+  template struct TransposeNormal<platform::CUDADeviceContext, TYPE>
+DEFINE_GPU_TRANS_NORMAL(float16);
+DEFINE_GPU_TRANS_NORMAL(bfloat16);
+DEFINE_GPU_TRANS_NORMAL(float);
+DEFINE_GPU_TRANS_NORMAL(double);
+DEFINE_GPU_TRANS_NORMAL(int);
+DEFINE_GPU_TRANS_NORMAL(int64_t);
+DEFINE_GPU_TRANS_NORMAL(bool);
+DEFINE_GPU_TRANS_NORMAL(int16_t);
+DEFINE_GPU_TRANS_NORMAL(uint8_t);
+DEFINE_GPU_TRANS_NORMAL(int8_t);
 struct TensorSetConstantGPU {
  TensorSetConstantGPU(const platform::DeviceContext& context,
                       framework::Tensor* tensor, float value)

--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -26,6 +26,14 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 namespace math {
+template <typename DeviceContext, typename T>
+struct TransposeNormal {
+  // for dims >= 7 situation
+  void operator()(const DeviceContext& context, const framework::Tensor& in,
+                  framework::Tensor* out, const std::vector<int>& axis);
+};
 template <typename DeviceContext, typename T, int Rank>
 struct Transpose {
  void operator()(const DeviceContext& context, const framework::Tensor& in,

--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -18,9 +18,10 @@ limitations under the License. */
 #include <set>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/cast_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
 namespace paddle {
@@ -34,6 +35,110 @@ namespace operators {
  }
 using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+inline void GetShuffledDim(const DDim& src_dims, DDim* dst_dims,
+                           const std::vector<int>& reduced_dims,
+                           std::vector<int>* perm_axis) {
+  // check if it's a reduced dim
+  std::vector<bool> src_dims_check(src_dims.size(), false);
+  size_t src_size = src_dims.size();
+  size_t reduce_size = reduced_dims.size();
+  for (size_t i = 0; i < reduce_size; ++i) {
+    dst_dims->at(src_size - reduce_size + i) = src_dims[reduced_dims[i]];
+    (*perm_axis)[src_size - reduce_size + i] = reduced_dims[i];
+    src_dims_check[reduced_dims[i]] = true;
+  }
+  size_t offset = 0;
+  for (size_t i = 0; i < src_dims_check.size(); ++i) {
+    bool is_reduced = src_dims_check[i];
+    if (!is_reduced) {
+      (*perm_axis)[offset] = i;
+      dst_dims->at(offset++) = src_dims[i];
+    }
+  }
+}
+template <typename DeviceContext, typename OutT>
+void GetShuffledInput(const framework::ExecutionContext& context,
+                      const Tensor* input, Tensor* shuffled_input,
+                      const std::vector<int>& dims) {
+  DDim shuffled_dims(input->dims());
+  std::vector<int> perm_axis(input->dims().size());
+  GetShuffledDim(input->dims(), &shuffled_dims, dims, &perm_axis);
+  shuffled_input->Resize(shuffled_dims);
+  shuffled_input->mutable_data<OutT>(context.GetPlace());
+  math::TransposeNormal<DeviceContext, OutT> trans;
+  trans(context.template device_context<DeviceContext>(), *input,
+        shuffled_input, perm_axis);
+}
+inline void GetOriginDimFromShuffled(const DDim& src_dim,
+                                     const std::vector<int>& dims,
+                                     std::vector<int>* origin_dim) {
+  DDim shuffled_dims(src_dim);
+  size_t n = src_dim.size();
+  std::vector<int> perm_axis(n);
+  GetShuffledDim(src_dim, &shuffled_dims, dims, &perm_axis);
+  for (size_t i = 0; i < n; ++i) {
+    (*origin_dim)[perm_axis[i]] = i;
+  }
+}
+template <typename DeviceContext, typename OutT, typename Functor>
+void HandleLargeDim(const framework::ExecutionContext& context,
+                    const Tensor* input, Tensor* output,
+                    const std::vector<int>& dims, bool keep_dim) {
+  //  shuffle the reduced dim to the end
+  Tensor shuffled_input;
+  GetShuffledInput<DeviceContext, OutT>(context, input, &shuffled_input, dims);
+  // transpose to 2D tensor whose shape is {unreduced, reduced}.
+  const int64_t unreduced = output->numel();
+  const int64_t reduced = shuffled_input.numel() / unreduced;
+  shuffled_input.Resize({unreduced, reduced});
+  DDim output_dim = output->dims();
+  output->Resize({unreduced});
+  ReduceFunctor<DeviceContext, OutT, 2, 1, Functor>(
+      context.template device_context<DeviceContext>(), shuffled_input, output,
+      {1}, keep_dim);
+  output->Resize(output_dim);
+}
+template <typename DeviceContext, typename T, typename Functor>
+void HandleLargeDimGrad(const framework::ExecutionContext& context,
+                        const framework::Tensor* x,
+                        const framework::Tensor* out,
+                        const framework::Tensor* dout, framework::Tensor* dx,
+                        const std::vector<int>& dims) {
+  const int64_t unreduced = out->numel();
+  const int64_t reduced = x->numel() / unreduced;
+  DDim out_dim(out->dims());
+  DDim x_dim(x->dims());
+  // transpose and reshape X
+  Tensor shuffled_x;
+  GetShuffledInput<DeviceContext, T>(context, x, &shuffled_x, dims);
+  DDim shuffled_dim = shuffled_x.dims();
+  shuffled_x.Resize({unreduced, reduced});
+  // reshape dX {unreduced, reduced}
+  dx->Resize({unreduced, reduced});
+  ReduceGradFunctor<DeviceContext, T, 2, Functor>(
+      context.template device_context<DeviceContext>(), shuffled_x, *out, *dout,
+      dx, {1});
+  // transpose dX
+  std::vector<int> origin_axis(x_dim.size());
+  GetOriginDimFromShuffled(x_dim, dims, &origin_axis);
+  Tensor dx_tmp;
+  framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp);
+  dx_tmp.Resize(shuffled_dim);
+  dx->Resize(x_dim);
+  math::TransposeNormal<DeviceContext, T> trans;
+  trans(context.template device_context<DeviceContext>(), dx_tmp, dx,
+        origin_axis);
+}
 template <typename DeviceContext, typename T, typename Functor>
 struct ReduceKernelFunctor {
@@ -69,22 +174,27 @@ struct ReduceKernelFunctor {
    } else {
      int ndim = input->dims().size();
      int rdim = dims.size();
-      HANDLE_DIM(6, 5);
+      if (ndim > 6) {
-      HANDLE_DIM(6, 4);
+        HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
-      HANDLE_DIM(6, 3);
+                                                     dims, keep_dim);
-      HANDLE_DIM(6, 2);
+      } else {
-      HANDLE_DIM(6, 1);
+        HANDLE_DIM(6, 5);
-      HANDLE_DIM(5, 4);
+        HANDLE_DIM(6, 4);
-      HANDLE_DIM(5, 3);
+        HANDLE_DIM(6, 3);
-      HANDLE_DIM(5, 2);
+        HANDLE_DIM(6, 2);
-      HANDLE_DIM(5, 1);
+        HANDLE_DIM(6, 1);
-      HANDLE_DIM(4, 3);
+        HANDLE_DIM(5, 4);
-      HANDLE_DIM(4, 2);
+        HANDLE_DIM(5, 3);
-      HANDLE_DIM(4, 1);
+        HANDLE_DIM(5, 2);
-      HANDLE_DIM(3, 2);
+        HANDLE_DIM(5, 1);
-      HANDLE_DIM(3, 1);
+        HANDLE_DIM(4, 3);
-      HANDLE_DIM(2, 1);
+        HANDLE_DIM(4, 2);
-      HANDLE_DIM(1, 1);
+        HANDLE_DIM(4, 1);
+        HANDLE_DIM(3, 2);
+        HANDLE_DIM(3, 1);
+        HANDLE_DIM(2, 1);
+        HANDLE_DIM(1, 1);
+      }
    }
  }
 };
@@ -137,7 +247,6 @@ class ReduceKernel : public framework::OpKernel<T> {
    }
  }
 };
 template <typename DeviceContext, typename OutT, typename Functor>
 class BoolReduceKernel : public framework::OpKernel<OutT> {
 public:
@@ -175,22 +284,27 @@ class BoolReduceKernel : public framework::OpKernel<OutT> {
      int ndim = input->dims().size();
      int rdim = dims.size();
      // comments for accelerating compiling temporarily.
-      //      HANDLE_DIM(6, 5);
+      if (ndim > 6) {
-      //      HANDLE_DIM(6, 4);
+        HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
-      //      HANDLE_DIM(6, 3);
+                                                     dims, keep_dim);
-      //      HANDLE_DIM(6, 2);
+      } else {
-      //      HANDLE_DIM(6, 1);
+        HANDLE_DIM(6, 5);
-      //      HANDLE_DIM(5, 4);
+        HANDLE_DIM(6, 4);
-      //      HANDLE_DIM(5, 3);
+        HANDLE_DIM(6, 3);
-      //      HANDLE_DIM(5, 2);
+        HANDLE_DIM(6, 2);
-      //      HANDLE_DIM(5, 1);
+        HANDLE_DIM(6, 1);
-      HANDLE_DIM(4, 3);
+        HANDLE_DIM(5, 4);
-      HANDLE_DIM(4, 2);
+        HANDLE_DIM(5, 3);
-      HANDLE_DIM(4, 1);
+        HANDLE_DIM(5, 2);
-      HANDLE_DIM(3, 2);
+        HANDLE_DIM(5, 1);
-      HANDLE_DIM(3, 1);
+        HANDLE_DIM(4, 3);
-      HANDLE_DIM(2, 1);
+        HANDLE_DIM(4, 2);
-      HANDLE_DIM(1, 1);
+        HANDLE_DIM(4, 1);
+        HANDLE_DIM(3, 2);
+        HANDLE_DIM(3, 1);
+        HANDLE_DIM(2, 1);
+        HANDLE_DIM(1, 1);
+      }
    }
  }
 };
@@ -279,6 +393,10 @@ class ReduceGradKernel : public framework::OpKernel<T> {
              context.template device_context<DeviceContext>(), *input0,
              *input1, *input2, output, dims);
          break;
+        default:
+          HandleLargeDimGrad<DeviceContext, T, Functor>(context, input0, input1,
+                                                        input2, output, dims);
+          break;
      }
    }
  }
@@ -313,12 +431,6 @@ class ReduceOp : public framework::OperatorWithKernel {
    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceOp");
    auto x_dims = ctx->GetInputDim("X");
    auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6,
-                      platform::errors::InvalidArgument(
-                          "The input tensor X's dimensions of ReduceOp "
-                          "should be less equal than 6. But received X's "
-                          "dimensions = %d, X's shape = [%s].",
-                          x_rank, x_dims));
    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
    PADDLE_ENFORCE_GT(dims.size(), 0,
                      platform::errors::InvalidArgument(
@@ -402,11 +514,6 @@ class ReduceGradOp : public framework::OperatorWithKernel {
                   "Out@GRAD", "ReduceOp");
    auto x_dims = ctx->GetInputDim("X");
    auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6,
-                      platform::errors::InvalidArgument(
-                          "Tensors with rank at most 6 are supported by "
-                          "ReduceOp. Received tensor with rank %d.",
-                          x_rank));
    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
    for (size_t i = 0; i < dims.size(); ++i) {
      PADDLE_ENFORCE_LT(dims[i], x_rank,

--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
@@ -68,6 +68,6 @@ REGISTER_OPERATOR(
    shape, ops::ShapeOp, ops::ShapeOpMaker,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int32_t>,
+REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<bool>, ops::ShapeKernel<int>,
                       ops::ShapeKernel<int64_t>, ops::ShapeKernel<float>,
                       ops::ShapeKernel<double>);
--- a/paddle/fluid/operators/shape_op.cu
+++ b/paddle/fluid/operators/shape_op.cu
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/shape_op.h"
 REGISTER_OP_CUDA_KERNEL(
-    shape, paddle::operators::ShapeKernel<int>,
+    shape, paddle::operators::ShapeKernel<bool>,
-    paddle::operators::ShapeKernel<int32_t>,
+    paddle::operators::ShapeKernel<int>,
    paddle::operators::ShapeKernel<int64_t>,
    paddle::operators::ShapeKernel<float>,
    paddle::operators::ShapeKernel<double>,

--- a/paddle/fluid/operators/transpose_op.h
+++ b/paddle/fluid/operators/transpose_op.h
@@ -53,10 +53,9 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx,
      trans6(dev_ctx, in, out, axis);
      break;
    default:
-      PADDLE_THROW(platform::errors::InvalidArgument(
+      // for dim >= 7 situation
-          "Tensors with rank at most 6 are supported"
+      math::TransposeNormal<DeviceContext, T> trans_normal;
-          ", but received input tensor's rank is %d,",
+      trans_normal(dev_ctx, in, out, axis);
-          dim));
  }
 }

--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -294,6 +294,7 @@ class ScopedTensorDescriptor {
  DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor);
 };
+#if CUDNN_VERSION >= 7201
 class ScopedRNNTensorDescriptor {
 public:
  ScopedRNNTensorDescriptor() {
@@ -337,6 +338,7 @@ class ScopedRNNTensorDescriptor {
  cudnnRNNDataDescriptor_t desc_;
  DISABLE_COPY_AND_ASSIGN(ScopedRNNTensorDescriptor);
 };
+#endif
 class ScopedDropoutDescriptor {
 public:

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include <set>
 #include <string>
+#include <thread>  //NOLINT
 #include <unordered_set>
 #include <vector>
@@ -23,6 +24,7 @@ limitations under the License. */
 #endif
 #include "glog/logging.h"
+#include "unsupported/Eigen/CXX11/ThreadPool"
 namespace paddle {
 namespace memory {
@@ -131,16 +133,31 @@ DeviceContextPool::DeviceContextPool(
 CPUDeviceContext::CPUDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
+  InitPoolDevice();
 }
 CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) {
  eigen_device_.reset(new Eigen::DefaultDevice());
+  InitPoolDevice();
+}
+void CPUDeviceContext::InitPoolDevice() {
+  using EigenEnv = Eigen::StlThreadEnvironment;
+  using EigenThreadPool = Eigen::ThreadPoolTempl<EigenEnv>;
+  int num_threads = std::thread::hardware_concurrency();
+  eigen_threadpool_.reset(new EigenThreadPool(num_threads));
+  eigen_pool_device_.reset(
+      new Eigen::ThreadPoolDevice(eigen_threadpool_.get(), num_threads));
 }
 Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
  return eigen_device_.get();
 }
+Eigen::ThreadPoolDevice* CPUDeviceContext::eigen_pool_device() const {
+  return eigen_pool_device_.get();
+}
 Place CPUDeviceContext::GetPlace() const { return place_; }
 #ifdef PADDLE_WITH_XPU

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -41,6 +41,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/stream/cuda_stream.h"
 #endif
+#define EIGEN_USE_THREADS
 #include "unsupported/Eigen/CXX11/Tensor"
 #ifdef PADDLE_WITH_XPU
@@ -65,11 +66,17 @@ class CPUDeviceContext : public DeviceContext {
  Eigen::DefaultDevice* eigen_device() const;
+  Eigen::ThreadPoolDevice* eigen_pool_device() const;
  Place GetPlace() const override;
+  inline void InitPoolDevice();
 private:
  CPUPlace place_;
  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+  std::unique_ptr<Eigen::ThreadPoolDevice> eigen_pool_device_;
+  std::unique_ptr<Eigen::ThreadPool> eigen_threadpool_;
 };
 template <typename Place>

--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
 #endif
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7
+CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP);
+#endif
 #ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7
 CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
 #endif

--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -101,9 +101,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
  __macro(cudnnDropoutGetStatesSize);                     \
  __macro(cudnnSetDropoutDescriptor);                     \
  __macro(cudnnRestoreDropoutDescriptor);                 \
-  __macro(cudnnCreateRNNDataDescriptor);                  \
-  __macro(cudnnDestroyRNNDataDescriptor);                 \
-  __macro(cudnnSetRNNDataDescriptor);                     \
  __macro(cudnnCreateRNNDescriptor);                      \
  __macro(cudnnGetRNNParamsSize);                         \
  __macro(cudnnGetRNNWorkspaceSize);                      \
@@ -112,11 +109,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
  __macro(cudnnRNNBackwardData);                          \
  __macro(cudnnRNNBackwardWeights);                       \
  __macro(cudnnRNNForwardInference);                      \
-  __macro(cudnnRNNForwardTrainingEx);                     \
-  __macro(cudnnSetRNNPaddingMode);                        \
-  __macro(cudnnRNNBackwardDataEx);                        \
-  __macro(cudnnRNNBackwardWeightsEx);                     \
-  __macro(cudnnRNNForwardInferenceEx);                    \
  __macro(cudnnDestroyDropoutDescriptor);                 \
  __macro(cudnnDestroyRNNDescriptor);                     \
  __macro(cudnnSetTensorNdDescriptorEx);
@@ -188,6 +180,19 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
+#if CUDNN_VERSION >= 7201
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \
+  __macro(cudnnCreateRNNDataDescriptor);             \
+  __macro(cudnnDestroyRNNDataDescriptor);            \
+  __macro(cudnnSetRNNDataDescriptor);                \
+  __macro(cudnnSetRNNPaddingMode);                   \
+  __macro(cudnnRNNForwardTrainingEx);                \
+  __macro(cudnnRNNBackwardDataEx);                   \
+  __macro(cudnnRNNBackwardWeightsEx);                \
+  __macro(cudnnRNNForwardInferenceEx);
+CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
 #if CUDNN_VERSION >= 7401
 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro)                     \
  __macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -621,6 +621,7 @@ function generate_upstream_develop_api_spec() {
    git checkout -b develop_base_pr upstream/$BRANCH
    cmake_gen $1
    build $2
+    cp ${PADDLE_ROOT}/python/requirements.txt /tmp
    git checkout $cur_branch
    generate_api_spec "$1" "DEV"
@@ -641,7 +642,12 @@ function generate_api_spec() {
    cd ${PADDLE_ROOT}/build/.check_api_workspace
    virtualenv .${spec_kind}_env
    source .${spec_kind}_env/bin/activate
-    pip install -r ${PADDLE_ROOT}/python/requirements.txt
+    if [ "$spec_kind" == "DEV" ]; then
+        pip install -r /tmp/requirements.txt
+    else
+        pip install -r ${PADDLE_ROOT}/python/requirements.txt
+    fi
    pip --no-cache-dir install ${PADDLE_ROOT}/build/python/dist/*whl
    spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec
    python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path
@@ -930,6 +936,10 @@ function parallel_test_base_gpu() {
 EOF
 set +x
+        precison_cases=""
+        if [ ${PRECISION_TEST:-OFF} == "ON" ]; then
+            precision_cases=`python $PADDLE_ROOT/tools/get_pr_ut.py`
+        fi
        EXIT_CODE=0;
        test_cases=$(ctest -N -V) # get all test cases
        exclusive_tests=''        # cases list which would be run exclusively
@@ -959,10 +969,23 @@ set +x
                    echo $testcase" will only run at night."
                    continue
                fi
+                if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]]; then
+                    will_test="false"
+                    for case in $precision_cases; do
+                        if [[ $testcase == $case ]]; then
+                            will_test="true"
+                            break
+                        fi
+                    done
+                    if [[ $will_test == "false" ]]; then
+                        echo $testcase" won't run in PRECISION_TEST mode."
+                        continue
+                    fi
+                fi
                if [[ "$is_multicard" == "" ]]; then
                  # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs
-                  read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist")
+                  read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_")
                fi
                if [[ "$is_exclusive" != "" ]]; then
@@ -1077,8 +1100,6 @@ set +x
                done
        fi
        if [[ "$EXIT_CODE" != "0" ]]; then
            if [[ "$failed_test_lists" == "" ]]; then
                echo "========================================"

--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -77,6 +77,7 @@ from .tensor.creation import triu  #DEFINE_ALIAS
 from .tensor.creation import tril  #DEFINE_ALIAS
 from .tensor.creation import meshgrid  #DEFINE_ALIAS
 from .tensor.creation import empty  #DEFINE_ALIAS
+from .tensor.creation import empty_like  #DEFINE_ALIAS
 from .tensor.linalg import matmul  #DEFINE_ALIAS
 from .tensor.linalg import dot  #DEFINE_ALIAS
 # from .tensor.linalg import einsum        #DEFINE_ALIAS

--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -608,25 +608,31 @@ class Fleet(object):
    @dygraph_only
    def distributed_model(self, model):
        """
-        Return dygraph distributed data parallel model (Layer)
+        Return distributed data parallel model (Only work in dygraph mode)
-        Only work in dygraph mode
+        Args:
+            model (Layer): the user-defind model which inherits Layer.
+        Returns:
+            distributed data parallel model which inherits Layer.
        Examples:
            .. code-block:: python
-            import paddle
-            import paddle.nn as nn
-            from paddle.distributed import fleet
-            class LinearNet(nn.Layer):
+                import paddle
-                def __init__(self):
+                import paddle.nn as nn
-                    super(LinearNet, self).__init__()
+                from paddle.distributed import fleet
-                    self._linear1 = nn.Linear(10, 10)
-                    self._linear2 = nn.Linear(10, 1)
+                class LinearNet(nn.Layer):
+                    def __init__(self):
+                        super(LinearNet, self).__init__()
+                        self._linear1 = nn.Linear(10, 10)
+                        self._linear2 = nn.Linear(10, 1)
-                def forward(self, x):
+                    def forward(self, x):
-                    return self._linear2(self._linear1(x))
+                        return self._linear2(self._linear1(x))
-            def train():
                # 1. enable dynamic mode
                paddle.disable_static()
@@ -658,8 +664,7 @@ class Fleet(object):
                adam.step()
                adam.clear_grad()
-            if __name__ == '__main__':
-                paddle.distributed.spawn(train)
        """
        assert model is not None
        self.model = paddle.DataParallel(model)
@@ -669,29 +674,30 @@ class Fleet(object):
    def state_dict(self):
        """
        Get state dict information from optimizer.
-        Only work in dygraph mode
+        (Only work in dygraph mode)
        Returns: 
            state_dict(dict) : dict contains all the Tensor used by optimizer
        Examples:
            .. code-block:: python
-            import numpy as np
-            import paddle
-            from paddle.distributed import fleet
-            paddle.disable_static()
+                import numpy as np
-            fleet.init(is_collective=True)
+                import paddle
+                from paddle.distributed import fleet
+                paddle.disable_static()
+                fleet.init(is_collective=True)
-            value = np.arange(26).reshape(2, 13).astype("float32")
+                value = np.arange(26).reshape(2, 13).astype("float32")
-            a = paddle.fluid.dygraph.to_variable(value)
+                a = paddle.fluid.dygraph.to_variable(value)
-            layer = paddle.nn.Linear(13, 5)
+                layer = paddle.nn.Linear(13, 5)
-            adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
+                adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
-            adam = fleet.distributed_optimizer(adam)
+                adam = fleet.distributed_optimizer(adam)
-            dp_layer = fleet.distributed_model(layer)
+                dp_layer = fleet.distributed_model(layer)
-            state_dict = adam.state_dict()
+                state_dict = adam.state_dict()
        """
        # imitate target optimizer retrieval
        return self.user_defined_optimizer.state_dict()
@@ -700,34 +706,36 @@ class Fleet(object):
    def set_state_dict(self, state_dict):
        """
        Load optimizer state dict.
-        Only work in dygraph mode
+        (Only work in dygraph mode)
        Args: 
            state_dict(dict) : Dict contains all the Tensor needed by optimizer
-        Returns: None 
+        Returns:
+            None
        Examples:
            .. code-block:: python
-            import numpy as np
-            import paddle
-            from paddle.distributed import fleet
-            paddle.disable_static()
+                import numpy as np
-            fleet.init(is_collective=True)
+                import paddle
+                from paddle.distributed import fleet
+                paddle.disable_static()
+                fleet.init(is_collective=True)
-            value = np.arange(26).reshape(2, 13).astype("float32")
+                value = np.arange(26).reshape(2, 13).astype("float32")
-            a = paddle.fluid.dygraph.to_variable(value)
+                a = paddle.fluid.dygraph.to_variable(value)
-            layer = paddle.nn.Linear(13, 5)
+                layer = paddle.nn.Linear(13, 5)
-            adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
+                adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
-            adam = fleet.distributed_optimizer(adam)
+                adam = fleet.distributed_optimizer(adam)
-            dp_layer = fleet.distributed_model(layer)
+                dp_layer = fleet.distributed_model(layer)
-            state_dict = adam.state_dict()
+                state_dict = adam.state_dict()
-            paddle.framework.save(state_dict, "paddle_dy")
+                paddle.framework.save(state_dict, "paddle_dy")
-            para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy")
+                para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy")
-            adam.set_state_dict(opti_state_dict)
+                adam.set_state_dict(opti_state_dict)
        """
        # imitate target optimizer retrieval
        return self.user_defined_optimizer.set_state_dict(state_dict)
@@ -736,42 +744,44 @@ class Fleet(object):
    def set_lr(self, value):
        """
        Set the value of the learning rate manually in the optimizer. 
-        Only work in dygraph mode
+        (Only work in dygraph mode)
        Args:
            value (float|Tensor): the value of learning rate
-        Returns: None 
+        Returns: 
+            None 
        Examples:
            .. code-block:: python
-            import numpy as np
-            import paddle
-            from paddle.distributed import fleet
-            paddle.disable_static()
+                import numpy as np
-            fleet.init(is_collective=True)
+                import paddle
+                from paddle.distributed import fleet
-            value = np.arange(26).reshape(2, 13).astype("float32")
+                paddle.disable_static()
-            a = paddle.fluid.dygraph.to_variable(value)
+                fleet.init(is_collective=True)
-            layer = paddle.nn.Linear(13, 5)
+                value = np.arange(26).reshape(2, 13).astype("float32")
-            adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
+                a = paddle.fluid.dygraph.to_variable(value)
-            adam = fleet.distributed_optimizer(adam)
+                layer = paddle.nn.Linear(13, 5)
-            dp_layer = fleet.distributed_model(layer)
+                adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
-            lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
+                adam = fleet.distributed_optimizer(adam)
-            for i in range(5):
+                dp_layer = fleet.distributed_model(layer)
-                adam.set_lr(lr_list[i])
-                lr = adam.get_lr()
+                lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
-                print("current lr is {}".format(lr))
+                for i in range(5):
-            # Print:
+                    adam.set_lr(lr_list[i])
-            #    current lr is 0.2
+                    lr = adam.get_lr()
-            #    current lr is 0.3
+                    print("current lr is {}".format(lr))
-            #    current lr is 0.4
+                # Print:
-            #    current lr is 0.5
+                #    current lr is 0.2
-            #    current lr is 0.6
+                #    current lr is 0.3
+                #    current lr is 0.4
+                #    current lr is 0.5
+                #    current lr is 0.6
        """
        # imitate target optimizer retrieval
        return self.user_defined_optimizer.set_lr(value)
@@ -780,31 +790,32 @@ class Fleet(object):
    def get_lr(self):
        """
        Get current step learning rate.
-        Only work in dygraph mode
+        (Only work in dygraph mode)
        Returns:
            float: The learning rate of the current step.
        Examples:
            .. code-block:: python
-            import numpy as np
-            import paddle
-            from paddle.distributed import fleet
-            paddle.disable_static()
+                import numpy as np
-            fleet.init(is_collective=True)
+                import paddle
+                from paddle.distributed import fleet
+                paddle.disable_static()
+                fleet.init(is_collective=True)
-            value = np.arange(26).reshape(2, 13).astype("float32")
+                value = np.arange(26).reshape(2, 13).astype("float32")
-            a = paddle.fluid.dygraph.to_variable(value)
+                a = paddle.fluid.dygraph.to_variable(value)
-            layer = paddle.nn.Linear(13, 5)
+                layer = paddle.nn.Linear(13, 5)
-            adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
+                adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
-            adam = fleet.distributed_optimizer(adam)
+                adam = fleet.distributed_optimizer(adam)
-            dp_layer = fleet.distributed_model(layer)
+                dp_layer = fleet.distributed_model(layer)
-            lr = adam.get_lr()
+                lr = adam.get_lr()
-            print(lr) # 0.01
+                print(lr) # 0.01
        """
        # imitate target optimizer retrieval
        return self.user_defined_optimizer.get_lr()
@@ -813,27 +824,27 @@ class Fleet(object):
    def step(self):
        """
        Execute the optimizer once.
-        Only work in dygraph mode
+        (Only work in dygraph mode)
-        Returns: None
+        Returns:
+            None
        Examples:
            .. code-block:: python
-            import paddle
+                import paddle
-            import paddle.nn as nn
+                import paddle.nn as nn
-            from paddle.distributed import fleet
+                from paddle.distributed import fleet
-            class LinearNet(nn.Layer):
+                class LinearNet(nn.Layer):
-                def __init__(self):
+                    def __init__(self):
-                    super(LinearNet, self).__init__()
+                        super(LinearNet, self).__init__()
-                    self._linear1 = nn.Linear(10, 10)
+                        self._linear1 = nn.Linear(10, 10)
-                    self._linear2 = nn.Linear(10, 1)
+                        self._linear2 = nn.Linear(10, 1)
-                def forward(self, x):
+                    def forward(self, x):
-                    return self._linear2(self._linear1(x))
+                        return self._linear2(self._linear1(x))
-            def train():
                # 1. enable dynamic mode
                paddle.disable_static()
@@ -865,8 +876,6 @@ class Fleet(object):
                adam.step()
                adam.clear_grad()
-            if __name__ == '__main__':
-                paddle.distributed.spawn(train)
        """
        # imitate target optimizer retrieval
@@ -875,28 +884,28 @@ class Fleet(object):
    @dygraph_only
    def clear_grad(self):
        """
-        Execute the optimizer once.
+        Clear the gradients of all optimized parameters for model.
-        Only work in dygraph mode
+        (Only work in dygraph mode)
-        Returns: None
+        Returns: 
+            None
        Examples:
            .. code-block:: python
-            import paddle
+                import paddle
-            import paddle.nn as nn
+                import paddle.nn as nn
-            from paddle.distributed import fleet
+                from paddle.distributed import fleet
-            class LinearNet(nn.Layer):
+                class LinearNet(nn.Layer):
-                def __init__(self):
+                    def __init__(self):
-                    super(LinearNet, self).__init__()
+                        super(LinearNet, self).__init__()
-                    self._linear1 = nn.Linear(10, 10)
+                        self._linear1 = nn.Linear(10, 10)
-                    self._linear2 = nn.Linear(10, 1)
+                        self._linear2 = nn.Linear(10, 1)
-                def forward(self, x):
+                    def forward(self, x):
-                    return self._linear2(self._linear1(x))
+                        return self._linear2(self._linear1(x))
-            def train():
                # 1. enable dynamic mode
                paddle.disable_static()
@@ -928,8 +937,6 @@ class Fleet(object):
                adam.step()
                adam.clear_grad()
-            if __name__ == '__main__':
-                paddle.distributed.spawn(train)
        """
        # imitate target optimizer retrieval
        return self.user_defined_optimizer.clear_grad()

--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -637,7 +637,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
        return "lo"
    def __start_kv_server(self, http_server_d, size_d):
-        from paddle.distributed.fleet.utils import KVServer
+        from paddle.distributed.fleet.utils.http_server import KVServer
        http_server = KVServer(int(self._http_ip_port[1]), size_d)
        http_server.start()
        wait_seconds = 5
@@ -651,6 +651,7 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker):
    def __init__(self, is_collective=False, init_gloo=False, **kwargs):
        super(UserDefinedRoleMaker, self).__init__(
            is_collective=is_collective, init_gloo=init_gloo, **kwargs)
+        self._init_gloo = init_gloo
    def _user_defined_ps_env(self):
        self._server_endpoints = self._kwargs.get("server_endpoints")

--- a/python/paddle/distributed/fleet/base/util_factory.py
+++ b/python/paddle/distributed/fleet/base/util_factory.py
@@ -16,20 +16,18 @@
 """basic collective operations in python"""
 """remote file system"""
-__all__ = ['UtilBase']
-import numpy as np
-import os
-import subprocess
-from paddle.fluid import core
-from collections import OrderedDict
-import paddle.fluid as fluid
-from google.protobuf import text_format
-from paddle.fluid import debugger
-from paddle.fluid.framework import Program
-from paddle.fluid.proto import framework_pb2
 from ..utils.fs import FS, LocalFS, HDFSClient
+from paddle.fluid.proto import framework_pb2
+from paddle.fluid.framework import Program
+from paddle.fluid import debugger
+from google.protobuf import text_format
+import paddle.fluid as fluid
+from collections import OrderedDict
+from paddle.fluid import core
+import subprocess
+import os
+import numpy as np
+__all__ = ['UtilBase']
 class UtilFactory(object):
@@ -53,7 +51,7 @@ class UtilBase(object):
    def _set_role_maker(self, role_maker):
        self.role_maker = role_maker
-    def set_file_system(self, fs_client):
+    def _set_file_system(self, fs_client):
        assert isinstance(
            fs_client, FS
        ), "fs_client must be the instance of paddle.distributed.fleet.utils.FS"
@@ -87,36 +85,183 @@ class UtilBase(object):
        return _comm_world
    def all_reduce(self, input, mode, comm_world="worker"):
+        """
+        All reduce `input` between specified collection. This is a distributed API.
+        Args:
+            input (list|numpy.array): The input variable to do all_reduce between specified collection.
+            mode (str): "sum" or "min" or "max".
+            comm_world (str, optional): Collection used to execute all_reduce operation. Supported collections incude `worker` , `server` and `all` . The default is `worker` .
+        Returns:
+            output(Numpy.array|None): A numpy array with the same shape as the `input` .
+        Examples:
+            .. code-block:: python
+                # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet as fleet
+                from paddle.distributed.fleet import PaddleCloudRoleMaker
+                import sys
+                import numpy as np
+                def train():
+                    role = PaddleCloudRoleMaker(
+                        is_collective=False,
+                        init_gloo=True,
+                        path="./tmp_gloo")
+                    fleet.init(role)
+                    fleet_util._set_role_maker(role)
+                    if fleet.is_server():
+                        input = [1, 2]
+                        output = fleet_util.all_reduce(input, "sum", "server")
+                        print(output)
+                        # [2, 4]
+                    elif fleet.is_worker():
+                        input = np.array([3, 4])
+                        output = fleet_util.all_reduce(input, "sum", "worker")
+                        print(output)
+                        # [6, 8]
+                    output = fleet_util.all_reduce(input, "sum", "all")
+                    print(output)
+                    # [8, 12]
+                if __name__ == "__main__":
+                    train()
+        """
        _comm_world = self.__check_comm_world(comm_world)
        return self.role_maker._all_reduce(_comm_world, input, mode)
    def barrier(self, comm_world="worker"):
+        """
+        Barrier between specified collection.
+        Args:
+            comm_world (str, optional): Collection used to execute barrier operation. Supported collections incude `worker` , `server` and `all` . The default is `worker` .
+        Examples:
+            .. code-block:: python
+                # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet as fleet
+                from paddle.distributed.fleet import PaddleCloudRoleMaker
+                import sys
+                def train():
+                    role = PaddleCloudRoleMaker(
+                        is_collective=False,
+                        init_gloo=True,
+                        path="./tmp_gloo")
+                    fleet.init(role)
+                    fleet_util._set_role_maker(role)
+                    if fleet.is_server():
+                        fleet_util.barrier("server")
+                        print("all server arrive here")
+                    elif fleet.is_worker():
+                        fleet_util.barrier("worker")
+                        print("all server arrive here")
+                    fleet_util.barrier("all")
+                    print("all servers and workers arrive here")
+                if __name__ == "__main__":
+                    train()
+        """
        _comm_world = self.__check_comm_world(comm_world)
        self.role_maker._barrier(_comm_world)
    def all_gather(self, input, comm_world="worker"):
+        """
+        All gather `input` between specified collection.
+        Args:
+            input (Int|Float): The input variable to do all_gather between specified collection.
+            comm_world (str, optional): Collection used to execute all_reduce operation. Supported collections incude `worker` , `server` and `all` . The default is `worker` .
+        Returns:
+            output (List): A list of gathered values.
+        Examples:
+            .. code-block:: python
+                # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet as fleet
+                from paddle.distributed.fleet import PaddleCloudRoleMaker
+                import sys
+                def train():
+                    role = PaddleCloudRoleMaker(
+                        is_collective=False,
+                        init_gloo=True,
+                        path="./tmp_gloo")
+                    fleet.init(role)
+                    fleet_util._set_role_maker(role)
+                    if fleet.is_server():
+                        input = fleet.server_index()
+                        output = fleet_util.all_gather(input, "server")
+                        print(output)
+                        # output = [0, 1]
+                    elif fleet.is_worker():
+                        input = fleet.worker_index()
+                        output = fleet_util.all_gather(input, "worker")
+                        # output = [0, 1]
+                        print(output)
+                    output = fleet_util.all_gather(input, "all")
+                    print(output)
+                    # output = [0, 1, 0, 1]
+                if __name__ == "__main__":
+                    train()
+        """
        _comm_world = self.__check_comm_world(comm_world)
        return self.role_maker._all_gather(_comm_world, input)
-    def broadcast(self):
+    def _broadcast(self):
        pass
-    def scatter(self):
+    def _scatter(self):
        pass
    def get_file_shard(self, files):
        """
-        split files before distributed training,
+        Split files before distributed training, and return filelist assigned to the current trainer.
-        example 1: files is [a, b, c ,d, e]  and trainer_num = 2, then trainer
-                   0 gets [a, b, c] and trainer 1 gets [d, e].
+        .. code-block:: text
-        example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets
-                   [a], trainer 1 gets [b],  trainer 2 gets []
+            example 1: files is [a, b, c ,d, e]  and trainer_num = 2, then trainer
+                    0 gets [a, b, c] and trainer 1 gets [d, e].
+            example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets
+                    [a], trainer 1 gets [b],  trainer 2 gets []
        Args:
-            files(list): file list need to be read.
+            files(list): File list need to be read.
        Returns:
-            list: files belongs to this worker.
+            List: Files belong to this worker.
+        Examples:
+            .. code-block:: python
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet.base.role_maker as role_maker
+                role = role_maker.UserDefinedRoleMaker(
+                    is_collective=False,
+                    init_gloo=False,
+                    current_id=0,
+                    role=role_maker.Role.WORKER,
+                    worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
+                    server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
+                fleet_util._set_role_maker(role)
+                files = fleet_util.get_file_shard(["file1", "file2", "file3"])
+                # files = ["file1", "file2"]
        """
        if not isinstance(files, list):
            raise TypeError("files should be a list of file need to be read.")
@@ -140,6 +285,30 @@ class UtilBase(object):
        return trainer_files[trainer_id]
    def print_on_rank(self, message, rank_id):
+        """
+        Woker of rank `rank_id` print some message. 
+        Args:
+            message(str): Log to be printed.
+            rank_id(int): trainer id.
+        Examples:
+            .. code-block:: python
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet.base.role_maker as role_maker
+                role = role_maker.UserDefinedRoleMaker(
+                    is_collective=False,
+                    init_gloo=False,
+                    current_id=0,
+                    role=role_maker.Role.WORKER,
+                    worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
+                    server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
+                fleet_util._set_role_maker(role)
+                fleet_util.print_on_rank("I'm worker 0", 0)
+        """
        if self.role_maker.worker_index() != rank_id:
            return
        print(message)
@@ -297,7 +466,7 @@ class UtilBase(object):
        with fluid.scope_guard(scope):
            inference_program, feed_target_names, fetch_targets = \
                fluid.io.load_inference_model(config.dump_model_dir, exe, model_filename=model_filename,
-                                            params_filename=config.save_params_filename)
+                                              params_filename=config.save_params_filename)
            # check program vars and saved vars shape
            orig_para_shape = {

--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -87,7 +87,7 @@ def _parse_args():
 see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
 ''')
-    #Optional arguments for the launch helper
+    # Optional arguments for the launch helper
    parser.add_argument(
        "--ips",
        type=str,
@@ -115,7 +115,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
        default="log",
        help="The path for each process's log.If it's not set, the log will printed to default pipe."
    )
-    #positional
+    # positional
    parser.add_argument(
        "training_script",
        type=str,
@@ -124,7 +124,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
        "followed by all the arguments for the "
        "training script")
-    #rest from the training program
+    # rest from the training program
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()
@@ -138,7 +138,7 @@ def get_cluster_from_args(args, gpus):
    # node_ip = args.node_ip
    assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
-                % (node_ip, node_ips)
+        % (node_ip, node_ips)
    node_rank = node_ips.index(node_ip)
    logger.debug("parsed from args: node_ips:{} node_ip:{} node_rank:{}".format(
@@ -280,7 +280,7 @@ def launch_ps(args):
        _, current_node_ip = get_host_name_ip()
    assert current_node_ip in node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \
-                % (current_node_ip, node_ips)
+        % (current_node_ip, node_ips)
    node_rank = node_ips.index(current_node_ip)
    logger.debug(
        "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}, server_ports:{}".
@@ -323,10 +323,12 @@ def launch_ps(args):
    for idx, cur_server in enumerate(pod.servers):
        proc_env = {
            "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
+            "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
            "PADDLE_PORT": cur_server.endpoint.split(":")[1],
            "TRAINING_ROLE": "PSERVER",
            "PADDLE_TRAINERS_NUM": str(worker_num),
-            "POD_IP": cur_server.endpoint.split(":")[0]
+            "POD_IP": cur_server.endpoint.split(":")[0],
+            "PADDLE_WITH_GLOO": "1"
        }
        current_env.update(proc_env)
@@ -365,7 +367,8 @@ def launch_ps(args):
            "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
            "PADDLE_TRAINERS_NUM": str(worker_num),
            "TRAINING_ROLE": "TRAINER",
-            "PADDLE_TRAINER_ID": str(cur_worker.rank)
+            "PADDLE_TRAINER_ID": str(cur_worker.rank),
+            "PADDLE_WITH_GLOO": "1"
        }
        current_env.update(proc_env)
@@ -430,7 +433,11 @@ def launch():
        co_arg for co_arg in collective_args
        if co_arg in " ".join(sys.argv[1:-1])
    ]
-    cuda_device_num = fluid.core.get_cuda_device_count()
+    if fluid.core.is_compiled_with_cuda():
+        cuda_device_num = fluid.core.get_cuda_device_count()
+    else:
+        cuda_device_num = 0
    if len(has_ps_args) > 0 or cuda_device_num == 0:
        logger.info(
            "Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}".

--- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -22,9 +22,13 @@ class AMPOptimizer(MetaOptimizerBase):
        self.amp_opt = None
        # we do not allow meta optimizer to be inner optimizer currently
        self.meta_optimizers_white_list = [
-            "LarsOptimizer", "LambOptimizer", "RecomputeOptimizer",
+            "LarsOptimizer",
-            "LocalSGDOptimizer", "GradientMergeOptimizer",
+            "LambOptimizer",
-            "GraphExecutionOptimizer", "AdaptiveLocalSGDOptimizer"
+            "RecomputeOptimizer",
+            "LocalSGDOptimizer",
+            "GradientMergeOptimizer",
+            "GraphExecutionOptimizer",
+            "AdaptiveLocalSGDOptimizer",
        ]
        self.meta_optimizers_black_list = ["DGCOptimizer"]

--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -26,7 +26,8 @@ class LocalSGDOptimizer(MetaOptimizerBase):
        self.inner_opt = optimizer
        self.meta_optimizers_white_list = []
        self.meta_optimizers_black_list = [
-            "GraphExecutionOptimizer", "AdaptiveLocalSGDOptimizer"
+            "GraphExecutionOptimizer",
+            "AdaptiveLocalSGDOptimizer",
        ]
        self.snapshot_key = '@SNAPSHOT'

--- a/python/paddle/distributed/fleet/utils/__init__.py
+++ b/python/paddle/distributed/fleet/utils/__init__.py
@@ -11,8 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .fs import *
-from .http_server import KVHandler, KVHTTPServer, KVServer
-#__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__
--- a/python/paddle/distributed/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -32,10 +32,7 @@ import functools
 from pathlib import PurePosixPath, Path
 import shutil
-__all__ = [
+__all__ = ['LocalFS', 'HDFSClient']
-    'FS', 'LocalFS', 'HDFSClient', 'ExecuteError', 'FSTimeOut',
-    'FSFileExistsError', 'FSFileNotExistsError', 'FSShellCmdAborted'
-]
 class ExecuteError(Exception):
@@ -117,7 +114,37 @@ class FS(object):
 class LocalFS(FS):
+    """
+    A tool of local file system.
+    Examples:
+        .. code-block:: python
+            from paddle.distributed.fleet.utils.fs import LocalFS
+            client = LocalFS()
+            subdirs, files = client.ls_dir("./")
+    """
    def ls_dir(self, fs_path):
+        """	
+        List directorys and files under `fs_path` .
+        Args:
+            fs_path(str): The local file path.
+        Returns:
+            Tuple: Return a 2-tuple, the first is a list of all its subdirectories, 
+            and the second is a list of all its subfiles, e.g. ([subdirname1, subdirname1, ...], [filename1, filename2, ...]).
+        Examples:
+            .. code-block:: python
+                from paddle.distributed.fleet.utils.fs import LocalFS
+                client = LocalFS()
+                subdirs, files = client.ls_dir("./")
+        """
        if not self.is_exist(fs_path):
            return [], []
@@ -132,11 +159,46 @@ class LocalFS(FS):
        return dirs, files
    def mkdirs(self, fs_path):
+        """
+        Create a remote HDFS directory.
+        Args:
+            fs_path(str): The local directory path.
+        Examples:
+            .. code-block:: python
+                from paddle.distributed.fleet.utils.fs import LocalFS
+                client = LocalFS()
+                client.mkdirs("test_mkdirs")
+                client.delete("test_mkdirs")
+        """
        assert not os.path.isfile(fs_path), "{} is already a file".format(
            fs_path)
        os.system("mkdir -p {}".format(fs_path))
    def rename(self, fs_src_path, fs_dst_path):
+        """
+        Rename the file.
+        Args:
+            fs_src_path(str): The actual name of the file or directory
+            fs_dst_path(str): The new name of the file or directory.
+        Examples:
+            .. code-block:: python
+                from paddle.distributed.fleet.utils.fs import LocalFS
+                client = LocalFS()
+                client.touch("test_rename_src")
+                print(client.is_exists("test_rename_src")) # True
+                client.rename("test_rename_src", "test_rename_dst")
+                print(client.is_exists("test_rename_src")) # False
+                print(client.is_exists("test_rename_dst")) # True
+                client.delete("test_rename_dst")
+        """
        os.rename(fs_src_path, fs_dst_path)
    def _rmr(self, fs_path):
@@ -146,6 +208,21 @@ class LocalFS(FS):
        os.remove(fs_path)
    def delete(self, fs_path):
+        """
+        Delete the local file path, whether it's a file or directory.
+        Args:
+            fs_path(str): The local file path.
+        Examples:
+            .. code-block:: python
+                from paddle.distributed.fleet.utils.fs import LocalFS
+                client = LocalFS()
+                client.mkdirs("test_localFS_mkdirs")
+                client.delete("test_localFS_mkdirs")
+        """
        if not self.is_exist(fs_path):
            return
@@ -158,15 +235,88 @@ class LocalFS(FS):
        return False
    def is_file(self, fs_path):
+        """
+        Whether the local file path is a file.
+        Args:
+            fs_path(str): The local file path.
+        Returns:
+            Bool: Return true if the path exists and it's a file, otherwise return false.
+        Examples:
+            .. code-block:: python
+                from paddle.distributed.fleet.utils.fs import LocalFS
+                client = LocalFS()
+                client.touch("test_is_file")
+                print(client.is_file("test_is_file")) # True
+                client.delete("test_is_file")
+        """
        return os.path.isfile(fs_path)
    def is_dir(self, fs_path):
+        """
+        Whether the local file path is a directory.
+        Args:
+            fs_path(str): The local file path.
+        Returns:
+            Bool: Return true if the path exists and it's a directory, otherwise return false.
+        Examples:
+            .. code-block:: python
+                from paddle.distributed.fleet.utils.fs import LocalFS
+                client = LocalFS()
+                client.mkdirs("test_is_dir")
+                print(client.is_dir("test_is_file")) # True
+                client.delete("test_is_dir")
+        """
        return os.path.isdir(fs_path)
    def is_exist(self, fs_path):
+        """
+        Whether the local file path exists.
+        Args:
+            fs_path(str): The local file path.
+        Returns:
+            Bool: Wheter it's a file or directory, return true if the path exists, 
+            otherwise return false.
+        Examples:
+            .. code-block:: python
+                from paddle.distributed.fleet.utils.fs import LocalFS
+                client = LocalFS()
+                ret = local_fs.is_exist("test_is_exist")
+        """
        return os.path.exists(fs_path)
    def touch(self, fs_path, exist_ok=True):
+        """
+        Create a local file.
+        Args:
+            fs_path(str): The local file path.
+            exist_ok(bool): When `fs_path` exists, if `exist_ok` is set false,
+            program will throw an Exception. Default is true.
+        Examples:
+            .. code-block:: python
+                from paddle.distributed.fleet.utils.fs import LocalFS
+                client = LocalFS()
+                client.touch("test_touch")
+                client.delete("test_touch")
+        """
        if self.is_exist(fs_path):
            if exist_ok:
                return
@@ -175,6 +325,26 @@ class LocalFS(FS):
        return Path(fs_path).touch(exist_ok=True)
    def mv(self, src_path, dst_path, overwrite=False, test_exists=False):
+        """
+        Move a local file or directory from `src_path` to `dst_path` .
+        Args:
+            src_path(str):  Name of the file or directory, that's needed to be moved.
+            dst_path(str):  Name of the file or directory to which to move to.
+            overwrite(bool): Whether to re-write `dst_path` if that exists. Default is False.
+            test_exists(bool): Check the existence of `src_path` and `dst_path` . 
+            When `test_exists` is set true, if `src_path` doesn't exist or `dst_path` exists, program will throw an Excetption. 
+        Examples:
+            .. code-block:: python
+                from paddle.distributed.fleet.utils.fs import LocalFS
+                client = LocalFS()
+                client.touch("test_mv_src")
+                client.mv("test_mv_src", "test_mv_dst")
+                client.delete("test_mv_dst")
+        """
        if not self.is_exist(src_path):
            raise FSFileNotExistsError
@@ -188,7 +358,21 @@ class LocalFS(FS):
    def list_dirs(self, fs_path):
        """	
-        list directory under fs_path, and only give the pure name, not include the fs_path	
+        Only list directorys under `fs_path` .
+        Args:
+            fs_path(str): The local file path.
+        Returns:
+            List: A list of all its subdirectories, e.g. [subdirname1, subdirname1, ...].
+        Examples:
+            .. code-block:: python
+                from paddle.distributed.fleet.utils.fs import LocalFS
+                client = LocalFS()
+                subdirs = client.list_dirs("./")
        """
        if not self.is_exist(fs_path):
            return []
@@ -217,7 +401,7 @@ def _handle_errors(max_time_out=None):
            while True:
                try:
                    return f(*args, **kwargs)
-                #important: only ExecuteError need to retry
+                # important: only ExecuteError need to retry
                except ExecuteError as e:
                    if time.time() - start >= time_out:
                        raise FSTimeOut("args:{} timeout:{}".format(
@@ -236,12 +420,36 @@ def _handle_errors(max_time_out=None):
 class HDFSClient(FS):
+    """
+    A tool of HDFS.
+    Args:
+        hadoop_home(str): Hadoop home. 
+        configs(dict): Hadoop config. It is a dictionary and needs to contain the
+            keys: "fs.default.name" and "hadoop.job.ugi".
+    Examples:
+        .. code-block:: text
+            from paddle.distributed.fleet.utils.fs import HDFSClient
+            hadoop_home = "/home/client/hadoop-client/hadoop/"
+            configs = {
+                "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                "hadoop.job.ugi": "hello,hello123"
+            }
+            client = HDFSClient(hadoop_home, configs)
+            client.ls_dir("hdfs:/test_hdfs_client")
+    """
    def __init__(
            self,
            hadoop_home,
            configs,
-            time_out=5 * 60 * 1000,  #ms
+            time_out=5 * 60 * 1000,  # ms
-            sleep_inter=1000):  #ms
+            sleep_inter=1000):  # ms
        # Raise exception if JAVA_HOME not exists.
        java_home = os.environ["JAVA_HOME"]
@@ -272,6 +480,30 @@ class HDFSClient(FS):
    @_handle_errors()
    def list_dirs(self, fs_path):
+        """	
+        Only list directorys under `fs_path` .
+        Args:
+            fs_path(str): The HDFS file path.
+        Returns:
+            List: A list of all its subdirectories, e.g. [subdirname1, subdirname1, ...].
+        Examples:
+            .. code-block:: text
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+                client = HDFSClient(hadoop_home, configs)
+                subdirs = client.list_dirs("hdfs:/test_hdfs_client")
+        """
        if not self.is_exist(fs_path):
            return []
@@ -281,7 +513,29 @@ class HDFSClient(FS):
    @_handle_errors()
    def ls_dir(self, fs_path):
        """	
-        list directory under fs_path, and only give the pure name, not include the fs_path	
+        List directorys and files under `fs_path` .
+        Args:
+            fs_path(str): The HDFS file path.
+        Returns:
+            Tuple: Return a 2-tuple, the first element is the list of all its subdirectories, 
+            and the second one is the list of all its subfiles, e.g. ([subdirname1, subdirname1, ...], [filename1, filename2, ...]).
+        Examples:
+            .. code-block:: text
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+                client = HDFSClient(hadoop_home, configs)
+                subdirs, files = client.ls_dir("hdfs:/test_hdfs_client")
        """
        if not self.is_exist(fs_path):
            return [], []
@@ -320,6 +574,30 @@ class HDFSClient(FS):
    @_handle_errors()
    def is_dir(self, fs_path):
+        """
+        Whether the remote HDFS path is a directory.
+        Args:
+            fs_path(str): The HDFS file path.
+        Returns:
+            Bool: Return true if the path exists and it's a directory, otherwise return false.
+        Examples:
+            .. code-block:: text
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+                client = HDFSClient(hadoop_home, configs)
+                ret = client.is_file("hdfs:/test_hdfs_client")
+        """
        if not self.is_exist(fs_path):
            return False
@@ -338,6 +616,30 @@ class HDFSClient(FS):
        return True
    def is_file(self, fs_path):
+        """
+        Whether the remote HDFS path is a file.
+        Args:
+            fs_path(str): The HDFS file path.
+        Returns:
+            Bool: Return true if the path exists and it's a file, otherwise return false.
+        Examples:
+            .. code-block:: text
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+                client = HDFSClient(hadoop_home, configs)
+                ret = client.is_file("hdfs:/test_hdfs_client")
+        """
        if not self.is_exist(fs_path):
            return False
@@ -345,6 +647,31 @@ class HDFSClient(FS):
    @_handle_errors()
    def is_exist(self, fs_path):
+        """
+        Whether the remote HDFS path exists.
+        Args:
+            fs_path(str): The hdfs file path.
+        Returns:
+            Bool: Whether it's is file or directory, return true if the path exists,
+            otherwise return false.
+        Examples:
+            .. code-block:: text
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+                client = HDFSClient(hadoop_home, configs)
+                ret = client.is_exist("hdfs:/test_hdfs_client")
+        """
        cmd = "ls {} ".format(fs_path)
        ret, out = self._run_cmd(cmd, redirect_stderr=True)
        if ret != 0:
@@ -357,6 +684,28 @@ class HDFSClient(FS):
    # can't retry
    def upload(self, local_path, fs_path):
+        """
+        Upload the local path to remote HDFS.
+        Args:
+            local_path(str): The local path.
+            fs_path(str): The HDFS path.
+        Examples:
+            .. code-block:: text
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+                client = HDFSClient(hadoop_home, configs)
+                client.upload("test_hdfs_client", "hdfs:/test_hdfs_client")
+        """
        if self.is_exist(fs_path):
            raise FSFileExistsError("{} exists".format(fs_path))
@@ -380,6 +729,28 @@ class HDFSClient(FS):
    # can't retry
    def download(self, fs_path, local_path):
+        """
+        Download remote HDFS path to the local.
+        Args:
+            fs_path(str):  The HDFS path.
+            local_path(str): The local path.
+        Examples:
+            .. code-block:: text
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+                client = HDFSClient(hadoop_home, configs)
+                client.download("hdfs:/test_hdfs_client", "./")
+        """
        if self.is_exist(local_path):
            raise FSFileExistsError("{} exists".format(local_path))
@@ -403,6 +774,27 @@ class HDFSClient(FS):
    @_handle_errors()
    def mkdirs(self, fs_path):
+        """
+        Create a remote HDFS directory.
+        Args:
+            fs_path(str): The HDFS directory path.
+        Examples:
+            .. code-block:: text
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+                client = HDFSClient(hadoop_home, configs)
+                client.mkdirs("hdfs:/test_hdfs_client")
+        """
        if self.is_exist(fs_path):
            return
@@ -425,6 +817,30 @@ class HDFSClient(FS):
                raise ExecuteError(cmd)
    def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
+        """
+        Move a remote HDFS file or directory from `fs_src_path` to `fs_dst_path` .
+        Args:
+            fs_src_path(str):  Name of the file or directory, that's needed to be moved.
+            fs_dst_path(str):  Name of the file or directory to which to move to.
+            overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False.
+            test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Excetption. 
+        Examples:
+            .. code-block:: text
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+                client = HDFSClient(hadoop_home, configs)
+                client.mv("hdfs:/test_hdfs_client", "hdfs:/test_hdfs_client2")
+        """
        if overwrite and self.is_exist(fs_dst_path):
            self.delete(fs_dst_path)
@@ -467,6 +883,27 @@ class HDFSClient(FS):
    @_handle_errors()
    def delete(self, fs_path):
+        """
+        Delete a remote HDFS path, whether it's a file or directory.
+        Args:
+            fs_path(str): The HDFS file path.
+        Examples:
+            .. code-block:: text
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+                client = HDFSClient(hadoop_home, configs)
+                client.delete("hdfs:/test_hdfs_client")
+        """
        if not self.is_exist(fs_path):
            return
@@ -477,6 +914,27 @@ class HDFSClient(FS):
        return self._rm(fs_path)
    def touch(self, fs_path, exist_ok=True):
+        """
+        Create a remote HDFS file.
+        Args:
+            fs_path(str): The HDFS file path.
+        Examples:
+            .. code-block:: text
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+                client = HDFSClient(hadoop_home, configs)
+                client.touch("hdfs:/test_hdfs_client")
+        """
        if self.is_exist(fs_path):
            if exist_ok:
                return

--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -67,6 +67,7 @@ class ImperativeQuantAware(object):
        Examples:
        .. code-block:: python
+            import paddle
            from paddle.fluid.contrib.slim.quantization \
                import ImperativeQuantAware
            from paddle.vision.models \
@@ -86,13 +87,12 @@ class ImperativeQuantAware(object):
            # ...
            # Save quant model for the inference.
-            imperative_qat.save_quantized_model(
+            paddle.jit.save(
-                dirname="./resnet50_qat",
+                layer=model,
-                model=model,
+                model_path="./resnet50_qat",
-                input_shape=[(3, 224, 224)],
+                input_spec=[
-                input_dtype=['float32'],
+                    paddle.static.InputSpec(
-                feed=[0],
+                    shape=[None, 3, 224, 224], dtype='float32')])
-                fetch=[0])
        """
        super(ImperativeQuantAware, self).__init__()
        self._weight_bits = weight_bits
@@ -148,75 +148,6 @@ class ImperativeQuantAware(object):
            quant_layer = self._get_quantized_counterpart(layer)
            setattr(obj, target, quant_layer)
-    def save_quantized_model(self,
-                             dirname,
-                             model,
-                             input_shape,
-                             input_dtype,
-                             feed,
-                             fetch,
-                             append_batch_size=True):
-        """
-        Save the quantized model for the inference.
-        Args:
-            dirname (str): the directory to save the quantized model.
-            model(fluid.dygraph.Layer): the quantized model to be saved.
-            input_shape(list[tuple(int)]): The shape value for each input,
-                e.g. [(3, 224, 224)].
-            input_dtype(list[str]): The dtype value for each input,
-                e.g. ['float32'].
-            feed(list[int]): the indices of the input variables of the
-                imperative functions which will be saved as input variables in
-                inference model.
-            fetch(list[int]): the indices of the returned variable of the
-                imperative functions which will be saved as output variables in
-                inference model.
-            append_batch_size(bool, optional):
-                If true, it prepends an extra axis to the input_shape, meanwhile,
-                the input_shape shouldn't contain the batch size dimension.
-                Otherwise, it just uses the input_shape. Default True.
-        Returns:
-            None
-        """
-        assert isinstance(
-            input_shape, list), "The parameter `input_shape` shoubld be a list."
-        assert isinstance(
-            input_dtype, list), "The parameter `input_dtype` shoubld be a list."
-        assert isinstance(feed, list), "The parameter `feed` shoubld be a list."
-        assert isinstance(fetch,
-                          list), "The parameter `fetch` shoubld be a list."
-        assert len(input_shape) == len(
-            input_dtype
-        ), "The length of input_shape should be equal to  input_dtype's."
-        assert len(input_dtype) == len(
-            feed), "The length of input_shape should be equal to  feed's."
-        with dygraph.guard():
-            model.eval()
-            input_vars = []
-            for i, (shape, dtype) in enumerate(zip(input_shape, input_dtype)):
-                if append_batch_size:
-                    shape = [None] + list(shape)
-                # Note(Aurelius84): need a elegant way to name this.
-                in_spec = paddle.static.InputSpec(shape, dtype, 'feed_%d' % i)
-                input_vars.append(in_spec)
-            # use `declarative` to convert dygraph into static program
-            model.forward = dygraph.jit.declarative(
-                model.forward, input_spec=input_vars)
-            outputs = model.forward.concrete_program.outputs
-        input_spec = [input_vars[i] for i in feed]
-        configs = dygraph.jit.SaveLoadConfig()
-        configs.separate_params = True
-        if not isinstance(outputs, (tuple, list)):
-            outputs = [outputs]
-        configs.output_spec = [outputs[i] for i in fetch]
-        dygraph.jit.save(
-            layer=model,
-            model_path=dirname,
-            input_spec=input_spec,
-            configs=configs)
    def _get_quantized_counterpart(self, layer):
        quant_layers = tuple(self._quant_layers_map.values())
        quantized_counterpart = tuple('Quantized' + k

--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -221,7 +221,7 @@ class TestImperativeQat(unittest.TestCase):
            model_dict = lenet.state_dict()
            fluid.save_dygraph(model_dict, "save_temp")
-            # test the correctness of `save_quantized_model`
+            # test the correctness of `paddle.jit.save`
            data = next(test_reader())
            test_data = np.array([x[0].reshape(1, 28, 28)
                                  for x in data]).astype('float32')
@@ -231,13 +231,14 @@ class TestImperativeQat(unittest.TestCase):
        # save inference quantized model
        path = "./mnist_infer_model"
-        imperative_qat.save_quantized_model(
+        paddle.jit.save(
-            dirname=path,
+            layer=lenet,
-            model=lenet,
+            model_path=path,
-            input_shape=[(1, 28, 28)],
+            input_spec=[
-            input_dtype=['float32'],
+                paddle.static.InputSpec(
-            feed=[0],
+                    shape=[None, 1, 28, 28], dtype='float32')
-            fetch=[0])
+            ])
        if core.is_compiled_with_cuda():
            place = core.CUDAPlace(0)
        else:
@@ -245,7 +246,10 @@ class TestImperativeQat(unittest.TestCase):
        exe = fluid.Executor(place)
        [inference_program, feed_target_names, fetch_targets] = (
            fluid.io.load_inference_model(
-                dirname=path, executor=exe))
+                dirname=path,
+                executor=exe,
+                model_filename="__model__",
+                params_filename="__variables__"))
        after_save, = exe.run(inference_program,
                              feed={feed_target_names[0]: test_data},
                              fetch_list=fetch_targets)
@@ -332,13 +336,13 @@ class TestImperativeQat(unittest.TestCase):
                if batch_id % 100 == 0:
                    _logger.info('{}: {}'.format('loss', avg_loss.numpy()))
-        imperative_qat.save_quantized_model(
+        paddle.jit.save(
-            dirname="./dynamic_mnist",
+            layer=lenet,
-            model=lenet,
+            model_path="./dynamic_mnist",
-            input_shape=[(1, 28, 28)],
+            input_spec=[
-            input_dtype=['float32'],
+                paddle.static.InputSpec(
-            feed=[0],
+                    shape=[None, 1, 28, 28], dtype='float32')
-            fetch=[0])
+            ])
        # static graph train
        _logger.info(

--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -39,6 +39,11 @@ try:
        third_lib_path = current_path + os.sep + '..' + os.sep + 'libs'
        os.environ['path'] = third_lib_path + ';' + os.environ['path']
        sys.path.insert(0, third_lib_path)
+        # Note: from python3.8, PATH will not take effect
+        # https://github.com/python/cpython/pull/12302
+        # Use add_dll_directory to specify dll resolution path
+        if sys.version_info[:2] >= (3, 8):
+            os.add_dll_directory(third_lib_path)
 except ImportError as e:
    from .. import compat as cpt

--- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
@@ -60,7 +60,7 @@ class DygraphToStaticAst(gast.NodeTransformer):
    def transfer_from_node_type(self, node_wrapper):
        translator_logger = logging_utils.TranslatorLogger()
        translator_logger.log(
-            1, "   Source code: \n{}".format(ast_to_source_code(self.root)))
+            1, "Source code: \n{}".format(ast_to_source_code(self.root)))
        # Generic transformation
        self.visit(node_wrapper.node)

--- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
@@ -12,17 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import logging
 import six
 import inspect
 import numpy as np
 import collections
 import paddle
 from paddle.fluid import core
 from paddle.fluid.dygraph import layers
 from paddle.fluid.layers.utils import flatten
 from paddle.fluid.layers.utils import pack_sequence_as
 from paddle.fluid.dygraph.base import switch_to_static_graph
+from paddle.fluid.dygraph.dygraph_to_static import logging_utils
 from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs
 from paddle.fluid.dygraph.dygraph_to_static.utils import type_name
 from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code
@@ -291,7 +292,7 @@ def convert_to_input_spec(inputs, input_spec):
        if len(inputs) > len(input_spec):
            for rest_input in inputs[len(input_spec):]:
                if isinstance(rest_input, (core.VarBase, np.ndarray)):
-                    logging.warning(
+                    logging_utils.warn(
                        "The inputs constain `{}` without specificing InputSpec, its shape and dtype will be treated immutable. "
                        "Please specific InputSpec information in `@declarative` if you expect them as mutable inputs.".
                        format(type_name(rest_input)))

--- a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
@@ -26,6 +26,8 @@ CODE_LEVEL_ENV_NAME = 'TRANSLATOR_CODE_LEVEL'
 DEFAULT_VERBOSITY = -1
 DEFAULT_CODE_LEVEL = -1
+LOG_AllTransformer = 100
 def synchronized(func):
    def wrapper(*args, **kwargs):
@@ -53,10 +55,15 @@ class TranslatorLogger(object):
            return
        self._initialized = True
+        self.logger_name = "Dynamic-to-Static"
        self._logger = log_helper.get_logger(
-            __name__, 1, fmt='%(asctime)s-%(levelname)s: %(message)s')
+            self.logger_name,
+            1,
+            fmt='%(asctime)s %(name)s %(levelname)s: %(message)s')
        self._verbosity_level = None
        self._transformed_code_level = None
+        self._need_to_echo_log_to_stdout = None
+        self._need_to_echo_code_to_stdout = None
    @property
    def logger(self):
@@ -86,6 +93,28 @@ class TranslatorLogger(object):
        self.check_level(level)
        self._transformed_code_level = level
+    @property
+    def need_to_echo_log_to_stdout(self):
+        if self._need_to_echo_log_to_stdout is not None:
+            return self._need_to_echo_log_to_stdout
+        return False
+    @need_to_echo_log_to_stdout.setter
+    def need_to_echo_log_to_stdout(self, log_to_stdout):
+        assert isinstance(log_to_stdout, (bool, type(None)))
+        self._need_to_echo_log_to_stdout = log_to_stdout
+    @property
+    def need_to_echo_code_to_stdout(self):
+        if self._need_to_echo_code_to_stdout is not None:
+            return self._need_to_echo_code_to_stdout
+        return False
+    @need_to_echo_code_to_stdout.setter
+    def need_to_echo_code_to_stdout(self, code_to_stdout):
+        assert isinstance(code_to_stdout, (bool, type(None)))
+        self._need_to_echo_code_to_stdout = code_to_stdout
    def check_level(self, level):
        if isinstance(level, (six.integer_types, type(None))):
            rv = level
@@ -110,34 +139,56 @@ class TranslatorLogger(object):
    def error(self, msg, *args, **kwargs):
        self.logger.error(msg, *args, **kwargs)
+        if self.need_to_echo_log_to_stdout:
+            self._output_to_stdout('ERROR: ' + msg, *args)
    def warn(self, msg, *args, **kwargs):
-        self.logger.warn(msg, *args, **kwargs)
+        self.logger.warning(msg, *args, **kwargs)
+        if self.need_to_echo_log_to_stdout:
+            self._output_to_stdout('WARNING: ' + msg, *args)
    def log(self, level, msg, *args, **kwargs):
        if self.has_verbosity(level):
-            self.logger.log(level, msg, *args, **kwargs)
+            msg_with_level = '(Level {}) {}'.format(level, msg)
+            self.logger.info(msg_with_level, *args, **kwargs)
+            if self.need_to_echo_log_to_stdout:
+                self._output_to_stdout('INFO: ' + msg_with_level, *args)
    def log_transformed_code(self, level, ast_node, transformer_name, *args,
                             **kwargs):
        if self.has_code_level(level):
            source_code = ast_to_source_code(ast_node)
-            header_msg = "After the level {} ast transformer: '{}', the transformed code:\n"\
+            if level == LOG_AllTransformer:
-                .format(level, transformer_name)
+                header_msg = "After the last level ast transformer: '{}', the transformed code:\n" \
+                    .format(transformer_name)
+            else:
+                header_msg = "After the level {} ast transformer: '{}', the transformed code:\n"\
+                    .format(level, transformer_name)
            msg = header_msg + source_code
            self.logger.info(msg, *args, **kwargs)
+            if self.need_to_echo_code_to_stdout:
+                self._output_to_stdout('INFO: ' + msg, *args)
+    def _output_to_stdout(self, msg, *args):
+        msg = self.logger_name + ' ' + msg
+        print(msg % args)
 _TRANSLATOR_LOGGER = TranslatorLogger()
-def set_verbosity(level=0):
+def set_verbosity(level=0, also_to_stdout=False):
    """
-    Sets the verbosity level of log for dygraph to static graph.
+    Sets the verbosity level of log for dygraph to static graph. Logs can be output to stdout by setting `also_to_stdout`.
    There are two means to set the logging verbosity:
-     1. Call function `set_verbosity`
-     2. Set environment variable `TRANSLATOR_VERBOSITY`
+    1. Call function `set_verbosity`
+    2. Set environment variable `TRANSLATOR_VERBOSITY`
    **Note**:
    `set_verbosity` has a higher priority than the environment variable.
@@ -145,6 +196,7 @@ def set_verbosity(level=0):
    Args:
        level(int): The verbosity level. The larger value idicates more verbosity.
            The default value is 0, which means no logging.
+        also_to_stdout(bool): Whether to also output log messages to `sys.stdout`.
    Examples:
        .. code-block:: python
@@ -159,27 +211,30 @@ def set_verbosity(level=0):
            # The verbosity level is now 3, but it has no effect because it has a lower priority than `set_verbosity`
    """
    _TRANSLATOR_LOGGER.verbosity_level = level
+    _TRANSLATOR_LOGGER.need_to_echo_log_to_stdout = also_to_stdout
 def get_verbosity():
    return _TRANSLATOR_LOGGER.verbosity_level
-LOG_AllTransformer = 100
+def set_code_level(level=LOG_AllTransformer, also_to_stdout=False):
-def set_code_level(level=LOG_AllTransformer):
    """
-    Sets the level to print code from specific level of Ast Transformer.
+    Sets the level to print code from specific level Ast Transformer. Code can be output to stdout by setting `also_to_stdout`.
    There are two means to set the code level:
-     1. Call function `set_code_level`
-     2. Set environment variable `TRANSLATOR_CODE_LEVEL`
+    1. Call function `set_code_level`
+    2. Set environment variable `TRANSLATOR_CODE_LEVEL`
    **Note**:
    `set_code_level` has a higher priority than the environment variable.
    Args:
        level(int): The level to print code. Default is 100, which means to print the code after all AST Transformers.
+        also_to_stdout(bool): Whether to also output code to `sys.stdout`.
    Examples:
        .. code-block:: python
@@ -195,6 +250,7 @@ def set_code_level(level=LOG_AllTransformer):
    """
    _TRANSLATOR_LOGGER.transformed_code_level = level
+    _TRANSLATOR_LOGGER.need_to_echo_code_to_stdout = also_to_stdout
 def get_code_level():

--- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
@@ -14,21 +14,17 @@
 from __future__ import print_function
 import numpy as np
-import logging
 import six
-from paddle.fluid import log_helper
 from paddle.fluid import framework, backward, core
 from paddle.fluid.dygraph import layers
 from paddle.fluid.dygraph.base import switch_to_static_graph
+from paddle.fluid.dygraph.dygraph_to_static import logging_utils
 from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM
 from paddle.fluid.layers.utils import flatten
 from paddle.fluid.layers.utils import pack_sequence_as
 import paddle.compat as cpt
-_logger = log_helper.get_logger(
-    __name__, logging.WARNING, fmt='%(asctime)s-%(levelname)s: %(message)s')
 class NestSequence(object):
    """
@@ -72,7 +68,7 @@ class NestSequence(object):
                if not isinstance(var, (framework.Variable, core.VarBase)):
                    warning_types.add(type(var))
            if warning_types:
-                _logger.warning(
+                logging_utils.warn(
                    "Output of traced function contains non-tensor type values: {}. "
                    "Currently, We don't support to update them while training and will return "
                    "what we first saw. Please try to return them as tensor.".

--- a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py
@@ -15,14 +15,8 @@
 from __future__ import print_function
 import gast
-import logging
-from paddle.fluid import log_helper
+from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, StaticAnalysisVisitor
-from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, NodeVarType, StaticAnalysisVisitor
-from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
-_logger = log_helper.get_logger(
-    __name__, logging.WARNING, fmt='%(asctime)s-%(levelname)s: %(message)s')
 class PrintTransformer(gast.NodeTransformer):

--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -13,17 +13,15 @@
 # limitations under the License.
 from __future__ import print_function
-import gast
 import collections
-import logging
+import gast
 import inspect
 import six
 import textwrap
 import threading
-import warnings
 import weakref
-import gast
 from paddle.fluid import framework
 from paddle.fluid import in_dygraph_mode
 from paddle.fluid.dygraph import layers
@@ -246,7 +244,7 @@ class StaticLayer(object):
        self._function_spec = FunctionSpec(function, input_spec)
        self._program_cache = ProgramCache()
        self._descriptor_cache = weakref.WeakKeyDictionary()
-        # Note: Hold a reference to ProgramTranslator for switching `enable_declarative`.
+        # Note: Hold a reference to ProgramTranslator for switching `enable_to_static`.
        self._program_trans = ProgramTranslator()
    def __get__(self, instance, owner):
@@ -299,16 +297,17 @@ class StaticLayer(object):
        """
        # 1. call dygraph function directly if not enable `declarative`
-        if not self._program_trans.enable_declarative:
+        if not self._program_trans.enable_to_static:
            logging_utils.warn(
-                "The decorator '@paddle.jit.to_static' does NOT work when setting ProgramTranslator.enable=False. "
+                "The decorator '@paddle.jit.to_static' does NOT work when setting ProgramTranslator.enable to False. "
-                "We will just return dygraph output.")
+                "We will just return dygraph output. If you would like to get static graph output, please call API "
+                "ProgramTranslator.enable(True)")
            return self._call_dygraph_function(*args, **kwargs)
-        if not in_dygraph_mode() and self._program_trans.enable_declarative:
+        if not in_dygraph_mode():
            raise RuntimeError(
                "Failed to run the callable object {} decorated by '@paddle.jit.to_static', "
-                "because it does NOT in dynamic mode. Please disable the static mode to enter dynamic mode with the "
+                "because it is NOT in dynamic mode. Please disable the static mode to enter dynamic mode with the "
                "following API: paddle.disable_static().".format(
                    self.dygraph_function))
@@ -450,7 +449,7 @@ class StaticLayer(object):
                    format(self._function_spec))
        # If more than one programs have been cached, return the recent converted program by default.
        elif cached_program_len > 1:
-            logging.warning(
+            logging_utils.warn(
                "Current {} has more than one cached programs: {}, the last traced progam will be return by default.".
                format(self._function_spec, cached_program_len))
@@ -631,7 +630,7 @@ class ProgramCache(object):
            # Note: raise warnings if number of traced program is more than `max_tracing_count`
            current_tracing_count = len(self._caches)
            if current_tracing_count > MAX_TRACED_PROGRAM_COUNT:
-                logging.warning(
+                logging_utils.warn(
                    "Current traced program number: {} > `max_tracing_count`:{}. Too much cached programs will bring expensive overhead. "
                    "The reason may be: (1) passing tensors with different shapes, (2) passing python objects instead of tensors.".
                    format(current_tracing_count, MAX_TRACED_PROGRAM_COUNT))
@@ -723,15 +722,15 @@ class ProgramTranslator(object):
            return
        self._initialized = True
        self._program_cache = ProgramCache()
-        self.enable_declarative = True
+        self.enable_to_static = True
-    def enable(self, enable_declarative):
+    def enable(self, enable_to_static):
        """
        Enable or disable the converting from imperative to declarative by
        ProgramTranslator globally.
        Args:
-            enable_declarative (bool): True or False to enable or disable declarative.
+            enable_to_static (bool): True or False to enable or disable declarative.
        Returns:
            None.
@@ -760,9 +759,9 @@ class ProgramTranslator(object):
                print(func(x).numpy()) # [[2. 2.]]
        """
-        check_type(enable_declarative, "enable_declarative", bool,
+        check_type(enable_to_static, "enable_to_static", bool,
                   "ProgramTranslator.enable")
-        self.enable_declarative = enable_declarative
+        self.enable_to_static = enable_to_static
    def get_output(self, dygraph_func, *args, **kwargs):
        """
@@ -803,10 +802,13 @@ class ProgramTranslator(object):
        assert callable(
            dygraph_func
        ), "Input dygraph_func is not a callable in ProgramTranslator.get_output"
-        if not self.enable_declarative:
-            warnings.warn(
+        if not self.enable_to_static:
-                "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable = False. "
+            logging_utils.warn(
-                "We will just return dygraph output.")
+                "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable to False. "
+                "We will just return dygraph output. "
+                "Please call ProgramTranslator.enable(True) if you would like to get static output."
+            )
            return dygraph_func(*args, **kwargs)
        try:
            function_spec = FunctionSpec(dygraph_func)
@@ -876,10 +878,12 @@ class ProgramTranslator(object):
        assert callable(
            dygraph_func
        ), "Input dygraph_func is not a callable in ProgramTranslator.get_func"
-        if not self.enable_declarative:
-            warnings.warn(
+        if not self.enable_to_static:
-                "The ProgramTranslator.get_func doesn't work when setting ProgramTranslator.enable=False. We will "
+            logging_utils.warn(
-                "just return dygraph output.")
+                "The ProgramTranslator.get_func doesn't work when setting ProgramTranslator.enable to False. We will "
+                "just return dygraph output. Please call ProgramTranslator.enable(True) if you would like to get static output."
+            )
            return dygraph_func
        static_func = convert_to_static(dygraph_func)
@@ -929,10 +933,13 @@ class ProgramTranslator(object):
        assert callable(
            dygraph_func
        ), "Input dygraph_func is not a callable in ProgramTranslator.get_program"
-        if not self.enable_declarative:
-            warnings.warn(
+        if not self.enable_to_static:
-                "The ProgramTranslator.get_program doesn't work when setting ProgramTranslator.enable=False."
+            logging_utils.warn(
-                "We will just return dygraph output.")
+                "The ProgramTranslator.get_program doesn't work when setting ProgramTranslator.enable to False."
+                "We will just return dygraph output. "
+                "Please call ProgramTranslator.enable(True) if you would like to get static output."
+            )
            return dygraph_func(*args, **kwargs)
        function_spec = FunctionSpec(dygraph_func)

--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -26,6 +26,7 @@ from paddle.fluid import core
 from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
 from paddle.fluid.data_feeder import check_type
 from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph
+from paddle.fluid.dygraph.dygraph_to_static import logging_utils
 from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity
 from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticLayer, unwrap_decorators
 from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer
@@ -119,8 +120,8 @@ def _dygraph_to_static_func_(dygraph_func):
    # TODO: remove this decorator after we finalize training API
    def __impl__(*args, **kwargs):
        program_translator = ProgramTranslator()
-        if in_dygraph_mode() or not program_translator.enable_declarative:
+        if in_dygraph_mode() or not program_translator.enable_to_static:
-            warnings.warn(
+            logging_utils.warn(
                "The decorator 'dygraph_to_static_func' doesn't work in "
                "dygraph mode or set ProgramTranslator.enable to False. "
                "We will just return dygraph output.")
@@ -215,7 +216,7 @@ def declarative(function=None, input_spec=None):
        if isinstance(function, Layer):
            if isinstance(function.forward, StaticLayer):
                class_name = function.__class__.__name__
-                warnings.warn(
+                logging_utils.warn(
                    "`{}.forward` has already been decorated somewhere. It will be redecorated to replace previous one.".
                    format(class_name))
            function.forward = decorated(function.forward)
@@ -832,9 +833,9 @@ def save(layer, model_path, input_spec=None, config=None):
    # 1. input check
    prog_translator = ProgramTranslator()
-    if not prog_translator.enable:
+    if not prog_translator.enable_to_static:
        raise RuntimeError(
-            "The paddle.jit.save doesn't work when setting ProgramTranslator.enable=False."
+            "The paddle.jit.save doesn't work when setting ProgramTranslator.enable to False."
        )
    if not isinstance(layer, Layer):
        raise TypeError(

--- a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
+++ b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
@@ -98,7 +98,7 @@ class AutoCheckpointChecker(object):
            self._fs_cache = os.getenv("PADDLE_EDL_FS_CACHE", ".cache")
            self._save_checkpoint_inter = int(
-                os.getenv("PADDLE_EDL_SAVE_CHECKPOINT_INTER", "900"))  #s
+                os.getenv("PADDLE_EDL_SAVE_CHECKPOINT_INTER", "900"))  # s
            if not self._ce_test:
                assert len(self._hdfs_home) > 3 and \
@@ -132,7 +132,7 @@ class AutoCheckpointChecker(object):
        if in_dygraph_mode():
            return False
-        return  self._run_env is not None and \
+        return self._run_env is not None and \
            self._platform is not None and \
            self._job_id is not None and \
            self._hdfs_home is not None and \

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -11229,7 +11229,7 @@ def shape(input):
                input.shape = [3, 2]
    Args:
-        input (Variable): The input can be N-D Tensor or SelectedRows with data type float16, float32, float64, int32, int64.
+        input (Variable): The input can be N-D Tensor or SelectedRows with data type bool, float16, float32, float64, int32, int64.
                          If input variable is type of SelectedRows, returns the shape of it's inner tensor.
    Returns:
@@ -11253,8 +11253,8 @@ def shape(input):
            print(res) # [array([  3, 100, 100], dtype=int32)]
    """
    check_variable_and_dtype(
-        input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'],
+        input, 'input',
-        'shape')
+        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'shape')
    helper = LayerHelper('shape', **locals())
    out = helper.create_variable_for_type_inference(dtype='int32')
    helper.append_op(

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -4,6 +4,7 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FL
 set(dist_ENVS http_proxy="" https_proxy="")
 file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
+list(REMOVE_ITEM DIST_TEST_OPS "test_dist_op")
 if(NOT WITH_NCCL)
    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl")
 endif()
@@ -102,7 +103,6 @@ if(WIN32)
 endif()
-LIST(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_new)
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint)
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint1)
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint2)
@@ -463,8 +463,8 @@ if(WITH_DISTRIBUTE)
 	   #py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS})
        if(NOT WIN32)
            py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS})
-            #py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})
+            py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})
-            #py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
+            py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
        endif(NOT WIN32)
    endif(NOT APPLE)
    if(WITH_DGC)
@@ -558,7 +558,7 @@ endif()
 set_tests_properties(test_parallel_executor_test_while_train test_parallel_executor_mnist
        test_parallel_executor_feed_persistable_var
        test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
-        test_data_norm_op test_imperative_using_non_zero_gpu
+        test_data_norm_op
        test_dataloader_keep_order
        test_dataloader_unkeep_order
        test_parallel_executor_fetch_isolated_var

--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
@@ -56,8 +56,30 @@ class TestLoggingUtils(unittest.TestCase):
        with self.assertRaises(TypeError):
            paddle.jit.set_verbosity(3.3)
-    def test_code_level(self):
+    def test_also_to_stdout(self):
+        logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout = None
+        self.assertEqual(
+            logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout, False)
+        paddle.jit.set_verbosity(also_to_stdout=False)
+        self.assertEqual(
+            logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout, False)
+        logging_utils._TRANSLATOR_LOGGER.need_to_echo_node_to_stdout = None
+        self.assertEqual(
+            logging_utils._TRANSLATOR_LOGGER.need_to_echo_code_to_stdout, False)
+        paddle.jit.set_code_level(also_to_stdout=True)
+        self.assertEqual(
+            logging_utils._TRANSLATOR_LOGGER.need_to_echo_code_to_stdout, True)
+        with self.assertRaises(AssertionError):
+            paddle.jit.set_verbosity(also_to_stdout=1)
+        with self.assertRaises(AssertionError):
+            paddle.jit.set_code_level(also_to_stdout=1)
+    def test_set_code_level(self):
        paddle.jit.set_code_level(None)
        os.environ[logging_utils.CODE_LEVEL_ENV_NAME] = '2'
        self.assertEqual(logging_utils.get_code_level(), 2)
@@ -71,7 +93,25 @@ class TestLoggingUtils(unittest.TestCase):
        with self.assertRaises(TypeError):
            paddle.jit.set_code_level(3.3)
-    def test_log(self):
+    def test_log_api(self):
+        # test api for CI Converage
+        logging_utils.set_verbosity(1, True)
+        logging_utils.warn("warn")
+        logging_utils.error("error")
+        logging_utils.log(1, "log level 1")
+        logging_utils.log(2, "log level 2")
+        source_code = "x = 3"
+        ast_code = gast.parse(source_code)
+        logging_utils.set_code_level(1, True)
+        logging_utils.log_transformed_code(1, ast_code, "TestTransformer")
+        logging_utils.set_code_level(logging_utils.LOG_AllTransformer, True)
+        logging_utils.log_transformed_code(logging_utils.LOG_AllTransformer,
+                                           ast_code, "TestTransformer")
+    def test_log_message(self):
        stream = io.BytesIO() if six.PY2 else io.StringIO()
        log = self.translator_logger.logger
        stdout_handler = logging.StreamHandler(stream)
@@ -84,13 +124,14 @@ class TestLoggingUtils(unittest.TestCase):
        if six.PY3:
            with mock.patch.object(sys, 'stdout', stream):
+                logging_utils.set_verbosity(1, False)
                logging_utils.warn(warn_msg)
                logging_utils.error(error_msg)
-                self.translator_logger.verbosity_level = 1
                logging_utils.log(1, log_msg_1)
                logging_utils.log(2, log_msg_2)
-            result_msg = '\n'.join([warn_msg, error_msg, log_msg_1, ""])
+            result_msg = '\n'.join(
+                [warn_msg, error_msg, "(Level 1) " + log_msg_1, ""])
            self.assertEqual(result_msg, stream.getvalue())
    def test_log_transformed_code(self):

--- a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
+++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
@@ -19,7 +19,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
-from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 java_home = os.environ["JAVA_HOME"]

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+class ConvAffineChannelFusePassExplicitPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding=[1, 1, 1, 1],
+                bias_attr=False,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass'))
+class ConvAffineChannelFusePassValidPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding='VALID',
+                bias_attr=False,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass'))
+class ConvAffineChannelFusePassSamePaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding='SAME',
+                bias_attr=False,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass'))
+class ConvEltwiseAddAffineChannelFusePassExplicitPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding=[1, 1, 1, 1],
+                bias_attr=param_attr,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_eltwiseadd_affine_channel_fuse_pass'))
+class ConvEltwiseAddAffineChannelFusePassValidPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding='VALID',
+                bias_attr=param_attr,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_eltwiseadd_affine_channel_fuse_pass'))
+class ConvEltwiseAddAffineChannelFusePassSamePaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding='Same',
+                bias_attr=param_attr,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_eltwiseadd_affine_channel_fuse_pass'))
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+"""Test for fusion of conv and bias."""
+#padding SAME
+class ConvBiasMkldnnFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding="SAME",
+                bias_attr=param_attr)
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+#padding VALID
+class ConvBiasMkldnnFusePassTest1(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding="VALID",
+                bias_attr=param_attr)
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+#padding number
+class ConvBiasMkldnnFusePassTest2(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding=[2, 4, 6, 8],
+                bias_attr=param_attr)
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+#dilation not supported yet, just print warning log and does not fuse
+class ConvBiasMkldnnFusePassTest3(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding="VALID",
+                dilation=2,
+                groups=3,
+                bias_attr=param_attr,
+                use_cudnn=False,
+                act="softmax",
+                data_format="NCHW")
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+#all conv params except for dilation
+class ConvBiasMkldnnFusePassTest4(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding="VALID",
+                groups=3,
+                bias_attr=param_attr,
+                use_cudnn=False,
+                act="softmax",
+                data_format="NCHW")
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+class ConvBnFusePassExplicitPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding=[1, 1, 1, 1],
+                bias_attr=False,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass'))
+class ConvBnFusePassValidPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding='VALID',
+                bias_attr=False,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass'))
+class ConvBnFusePassSamePaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding='SAME',
+                bias_attr=False,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass'))
+class ConvEltwiseAddBnFuseExplicitPaddingPass(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding=[1, 1, 1, 1],
+                bias_attr=None,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass'))
+class ConvEltwiseAddBnFuseValidPaddingPass(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding='VALID',
+                bias_attr=None,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass'))
+class ConvEltwiseAddBnFuseSamePaddingPass(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding='SAME',
+                bias_attr=None,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass'))
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
@@ -19,6 +19,7 @@ import numpy as np
 from inference_pass_test import InferencePassTest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
 from paddle.fluid.core import AnalysisConfig
 """Test for fusion of conv, elementwise_add and 2 act."""
@@ -46,6 +47,9 @@ class ConvElementwiseAdd2ActFusePassTest(InferencePassTest):
        if core.is_compiled_with_cuda():
            use_gpu = True
            self.check_output_with_option(use_gpu)
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_elementwise_add2_act_fuse_pass'))
 if __name__ == "__main__":

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
@@ -19,6 +19,7 @@ import numpy as np
 from inference_pass_test import InferencePassTest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
 from paddle.fluid.core import AnalysisConfig
 """Test for fusion of conv, elementwise_add and act."""
@@ -48,6 +49,9 @@ class ConvElementwiseAddActFusePassTest(InferencePassTest):
        if core.is_compiled_with_cuda():
            use_gpu = True
            self.check_output_with_option(use_gpu)
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_elementwise_add_act_fuse_pass'))
 if __name__ == "__main__":

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
@@ -19,6 +19,7 @@ import numpy as np
 from inference_pass_test import InferencePassTest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
 from paddle.fluid.core import AnalysisConfig
 """Test for fusion of conv and elementwise_add."""
@@ -44,6 +45,8 @@ class ConvElementwiseAddFusePassTest(InferencePassTest):
        if core.is_compiled_with_cuda():
            use_gpu = True
            self.check_output_with_option(use_gpu)
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_elementwise_add_fuse_pass'))
 if __name__ == "__main__":

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+class RepeatedFcReluFusePass3Test(InferencePassTest):
+    def setUp(self):
+        fc_num = 3
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                bias_attr=param_attr,
+                act=None)
+            fc_outs = []
+            fc_outs.append(
+                fluid.layers.fc(input=[conv_out], act="relu", size=1000))
+            for i in range(1, fc_num):
+                fc_outs.append(
+                    fluid.layers.fc(
+                        input=[fc_outs[i - 1]], act="relu", size=1000))
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [fc_outs[fc_num - 1]]
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('repeated_fc_relu_fuse_pass'))
+class RepeatedFcReluFusePass9Test(InferencePassTest):
+    def setUp(self):
+        fc_num = 9
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                bias_attr=param_attr,
+                act=None)
+            fc_outs = []
+            fc_outs.append(
+                fluid.layers.fc(input=[conv_out], act="relu", size=1000))
+            for i in range(1, fc_num):
+                fc_outs.append(
+                    fluid.layers.fc(
+                        input=[fc_outs[i - 1]], act="relu", size=1000))
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [fc_outs[fc_num - 1]]
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('repeated_fc_relu_fuse_pass'))
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+class ShuffleChannelFuseTRTPassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 6, 64, 64], dtype="float32")
+            reshape1 = fluid.layers.reshape(x=data, shape=[-1, 2, 3, 64, 64])
+            trans = fluid.layers.transpose(x=reshape1, perm=[0, 2, 1, 3, 4])
+            reshape2 = fluid.layers.reshape(x=trans, shape=[-1, 6, 64, 64])
+            out = fluid.layers.batch_norm(reshape2, is_test=True)
+        self.feeds = {
+            "data": np.random.random([1, 6, 64, 64]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = ShuffleChannelFuseTRTPassTest.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('shuffle_channel_detect_pass'))
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
@@ -67,13 +67,13 @@ class AutoCheckpointTestDist(AutoCheckPointACLBase):
        save_dir = "./run_save_0"
        fs.delete(save_dir)
-        #basic
+        # basic
        exe, main_prog, startup_prog = self._generate()
        compiled, data_loader, optimizer, loss, image, label = \
            self._init_env(exe, main_prog, startup_prog, minimize=False)
-        #fleet
+        # fleet
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
@@ -26,7 +26,7 @@ def stable_softmax(x):
    return exps / np.sum(exps)
-def log_softmax(x, axis=-1):
+def log_softmax(x, axis=1):
    softmax_out = np.apply_along_axis(stable_softmax, axis, x)
    return np.log(softmax_out)

--- a/python/paddle/fluid/tests/unittests/test_device_guard.py
+++ b/python/paddle/fluid/tests/unittests/test_device_guard.py
@@ -33,6 +33,14 @@ def execute(main_program, startup_program):
    exe.run(main_program)
+def get_vaild_warning_num(warning, w):
+    num = 0
+    for i in range(len(w)):
+        if warning in str(w[i].message):
+            num += 1
+    return num
 class TestDeviceGuard(unittest.TestCase):
    def test_device_guard(self):
        main_program = fluid.Program()
@@ -133,7 +141,10 @@ class TestDeviceGuard(unittest.TestCase):
                        i = fluid.layers.increment(x=i, value=1, in_place=True)
                        fluid.layers.less_than(x=i, y=loop_len, cond=cond)
-        assert len(w) == 1
+        warning = "The Op(while) is not support to set device."
+        warning_num = get_vaild_warning_num(warning, w)
+        assert warning_num == 1
        all_ops = main_program.global_block().ops
        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
        for op in all_ops:
@@ -169,7 +180,10 @@ class TestDeviceGuard(unittest.TestCase):
                        shape=[1], value=4.0, dtype='float32')
                    result = fluid.layers.less_than(x=x, y=y, force_cpu=False)
-        assert len(w) == 2
+        warning = "\'device_guard\' has higher priority when they are used at the same time."
+        warning_num = get_vaild_warning_num(warning, w)
+        assert warning_num == 2
        all_ops = main_program.global_block().ops
        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
        for op in all_ops:

--- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
@@ -67,6 +67,13 @@ class TestElementwiseModOp_scalar(TestElementwiseModOp):
        self.out = np.floor_divide(self.x, self.y)
+class TestElementwiseModOpInverse(TestElementwiseModOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype)
+        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
+        self.out = np.floor_divide(self.x, self.y)
 class TestFloorDivideOp(unittest.TestCase):
    def test_name(self):
        with fluid.program_guard(fluid.Program()):

--- a/python/paddle/fluid/tests/unittests/test_empty_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_empty_like_op.py
+#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.data_feeder import convert_dtype
+import paddle.fluid.core as core
+from paddle.static import program_guard, Program
+class TestEmptyLikeAPICommon(unittest.TestCase):
+    def __check_out__(self, out):
+        data_type = convert_dtype(out.dtype)
+        self.assertEqual(data_type, self.dst_dtype,
+                         'dtype should be %s, but get %s' %
+                         (self.dst_dtype, data_type))
+        shape = out.shape
+        self.assertTupleEqual(shape, self.dst_shape,
+                              'shape should be %s, but get %s' %
+                              (self.dst_shape, shape))
+        if data_type in ['float32', 'float64', 'int32', 'int64']:
+            max_value = np.nanmax(out)
+            min_value = np.nanmin(out)
+            always_non_full_zero = max_value > min_value
+            always_full_zero = max_value == 0.0 and min_value == 0.0
+            self.assertTrue(always_full_zero or always_non_full_zero,
+                            'always_full_zero or always_non_full_zero.')
+        elif data_type in ['bool']:
+            total_num = out.size
+            true_num = np.sum(out == True)
+            false_num = np.sum(out == False)
+            self.assertTrue(total_num == true_num + false_num,
+                            'The value should always be True or False.')
+        else:
+            self.assertTrue(False, 'invalid data type')
+class TestEmptyLikeAPI(TestEmptyLikeAPICommon):
+    def setUp(self):
+        self.init_config()
+    def test_dygraph_api_out(self):
+        paddle.disable_static()
+        out = paddle.empty_like(self.x, self.dtype)
+        self.__check_out__(out.numpy())
+        paddle.enable_static()
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("float32")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+class TestEmptyLikeAPI2(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("float64")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+class TestEmptyLikeAPI3(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("int")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+class TestEmptyLikeAPI4(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("int64")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+class TestEmptyLikeAPI5(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("bool")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+class TestEmptyLikeAPI6(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("float64")
+        self.dtype = "float32"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+class TestEmptyLikeAPI7(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("int")
+        self.dtype = "float32"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+class TestEmptyLikeAPI8(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("int64")
+        self.dtype = "float32"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+class TestEmptyLikeAPI9(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("bool")
+        self.dtype = "float32"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+class TestEmptyLikeAPI10(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("float32")
+        self.dtype = "bool"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+class TestEmptyLikeAPI_Static(TestEmptyLikeAPICommon):
+    def setUp(self):
+        self.init_config()
+    def test_static_graph(self):
+        dtype = 'float32'
+        train_program = Program()
+        startup_program = Program()
+        with program_guard(train_program, startup_program):
+            x = np.random.random(self.x_shape).astype(dtype)
+            data_x = paddle.static.data(
+                'x', shape=self.data_x_shape, dtype=dtype)
+            out = paddle.empty_like(data_x)
+        place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        exe = paddle.static.Executor(place)
+        res = exe.run(train_program, feed={'x': x}, fetch_list=[out])
+        self.dst_dtype = dtype
+        self.dst_shape = x.shape
+        self.__check_out__(res[0])
+    def init_config(self):
+        self.x_shape = (200, 3)
+        self.data_x_shape = [200, 3]
+class TestEmptyLikeAPI_Static2(TestEmptyLikeAPI_Static):
+    def init_config(self):
+        self.x_shape = (3, 200, 3)
+        self.data_x_shape = [-1, 200, 3]
+class TestEmptyError(unittest.TestCase):
+    def test_attr(self):
+        def test_dtype():
+            x = np.random.random((200, 3)).astype("float64")
+            dtype = 'uint8'
+            result = paddle.empty_like(x, dtype=dtype)
+        self.assertRaises(TypeError, test_dtype)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
@@ -40,9 +40,9 @@ class TestCloudRoleMaker(unittest.TestCase):
            from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
            from paddle.fluid.incubate.fleet.base.role_maker import \
                GeneralRoleMaker
-            from paddle.distributed.fleet.utils import KVHandler
+            from paddle.distributed.fleet.utils.http_server import KVHandler
-            from paddle.distributed.fleet.utils import KVServer
+            from paddle.distributed.fleet.utils.http_server import KVServer
-            from paddle.distributed.fleet.utils import KVHTTPServer
+            from paddle.distributed.fleet.utils.http_server import KVHTTPServer
        except:
            print("warning: no fleet, skip test_pslib_4")
            return

--- a/python/paddle/fluid/tests/unittests/test_fleet_util.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py
@@ -81,12 +81,12 @@ class TestFleetUtil(unittest.TestCase):
        self.assertEqual(user_id, 10)
    def test_fs(self):
-        from paddle.distributed.fleet.utils import LocalFS
+        from paddle.distributed.fleet.utils.fs import LocalFS
        fs = LocalFS()
        dirs, files = fs.ls_dir("test_tmp")
        dirs, files = fs.ls_dir("./")
        self.assertFalse(fs.need_upload_download())
-        fleet_util.set_file_system(fs)
+        fleet_util._set_file_system(fs)
    def test_barrier(self):
        try:

--- a/python/paddle/fluid/tests/unittests/test_fs_interface.py
+++ b/python/paddle/fluid/tests/unittests/test_fs_interface.py
@@ -20,7 +20,7 @@ import os
 import sys
 import inspect
-from paddle.distributed.fleet.utils import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 class FSTest(unittest.TestCase):

--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
@@ -216,7 +216,7 @@ class API_TestGather(unittest.TestCase):
                      "index": index_np,
                      'axis': axis_np},
                fetch_list=[out])
-            expected_output = gather_numpy(x_np, index_np, axis_np)
+            expected_output = gather_numpy(x_np, index_np, axis_np[0])
        self.assertTrue(np.allclose(result, expected_output))

--- a/python/paddle/fluid/tests/unittests/test_hdfs1.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs1.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
-from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 java_home = os.environ["JAVA_HOME"]
-from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
 class FSTest1(FSTestBase):
    def test_timeout(self):

--- a/python/paddle/fluid/tests/unittests/test_hdfs2.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs2.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
-from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 java_home = os.environ["JAVA_HOME"]
-from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
 class FSTest2(FSTestBase):
    def test_hdfs(self):

--- a/python/paddle/fluid/tests/unittests/test_hdfs3.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs3.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
-from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 java_home = os.environ["JAVA_HOME"]
-from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
 class FSTest3(FSTestBase):
    def test_hdfs(self):

--- a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py
@@ -21,7 +21,6 @@ import numpy as np
 class TestImperativeUsingNonZeroGpu(unittest.TestCase):
    def run_main(self, np_arr, place):
        with guard(place):
-            embedding = Embedding(size=[10, 10])
            var = to_variable(np_arr)
            self.assertTrue(np.array_equal(np_arr, var.numpy()))
@@ -30,7 +29,6 @@ class TestImperativeUsingNonZeroGpu(unittest.TestCase):
            return
        np_arr = np.random.random([11, 13]).astype('float32')
-        self.run_main(np_arr, fluid.CUDAPlace(1))
        self.run_main(np_arr, fluid.CUDAPlace(0))

--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
--- a/python/paddle/fluid/tests/unittests/test_save_model_without_var.py
+++ b/python/paddle/fluid/tests/unittests/test_save_model_without_var.py
@@ -50,7 +50,7 @@ class TestSaveModelWithoutVar(unittest.TestCase):
                params_filename='params')
            expected_warn = "no variable in your model, please ensure there are any variables in your model to save"
            self.assertTrue(len(w) > 0)
-            self.assertTrue(expected_warn == str(w[0].message))
+            self.assertTrue(expected_warn == str(w[-1].message))
 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -99,6 +99,18 @@ class TestCase7(TestTransposeOp):
        self.axis = (0, 1, 3, 2)
+class TestCase8(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
+        self.axis = (0, 1, 3, 2, 4, 5, 6, 7)
+class TestCase9(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
+        self.axis = (6, 1, 3, 5, 0, 2, 4, 7)
 class TestTransposeOpError(unittest.TestCase):
    def test_errors(self):
        with program_guard(Program(), Program()):

--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -1093,7 +1093,7 @@ def cross_entropy(input,
            " 'none', but received %s, which is not allowed." % reduction)
    #step 1. log_softmax
-    log_softmax_out = paddle.nn.functional.log_softmax(input)
+    log_softmax_out = paddle.nn.functional.log_softmax(input, axis=1)
    if weight is not None and not isinstance(weight, Variable):
        raise ValueError(
            "The weight' is not a Variable, please convert to Variable.")

--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -32,6 +32,21 @@ import random
 import zlib
 import paddle.compat as cpt
+# On macOS, the 'spawn' start method is now the default in Python3.8 multiprocessing,
+# Paddle is currently unable to solve this, so forces the process to start using 
+# the 'fork' start method.
+#
+# TODO: This solution is not good, because the fork start method could lead to 
+# crashes of the subprocess. Figure out how to make 'spawn' work.
+#
+# For more details, please refer to
+# https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
+# https://bugs.python.org/issue33725
+if sys.version_info >= (3, 8):
+    fork_context = multiprocessing.get_context('fork')
+else:
+    fork_context = multiprocessing
 def cache(reader):
    """
@@ -560,9 +575,9 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
            six.reraise(*sys.exc_info())
    def queue_reader():
-        queue = multiprocessing.Queue(queue_size)
+        queue = fork_context.Queue(queue_size)
        for reader in readers:
-            p = multiprocessing.Process(
+            p = fork_context.Process(
                target=_read_into_queue, args=(reader, queue))
            p.start()
@@ -593,9 +608,9 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
    def pipe_reader():
        conns = []
        for reader in readers:
-            parent_conn, child_conn = multiprocessing.Pipe()
+            parent_conn, child_conn = fork_context.Pipe()
            conns.append(parent_conn)
-            p = multiprocessing.Process(
+            p = fork_context.Process(
                target=_read_into_pipe, args=(reader, child_conn))
            p.start()

--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -41,6 +41,7 @@ from .creation import triu  #DEFINE_ALIAS
 from .creation import tril  #DEFINE_ALIAS
 from .creation import meshgrid  #DEFINE_ALIAS
 from .creation import empty  #DEFINE_ALIAS
+from .creation import empty_like  #DEFINE_ALIAS
 from .io import save  #DEFINE_ALIAS
 from .io import load  #DEFINE_ALIAS
 from .linalg import matmul  #DEFINE_ALIAS

--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
--- a/python/paddle/tests/test_model.py
+++ b/python/paddle/tests/test_model.py
--- a/python/paddle/utils/__init__.py
+++ b/python/paddle/utils/__init__.py
--- a/python/paddle/utils/lazy_import.py
+++ b/python/paddle/utils/lazy_import.py
--- a/python/paddle/vision/datasets/folder.py
+++ b/python/paddle/vision/datasets/folder.py
--- a/python/paddle/vision/transforms/functional.py
+++ b/python/paddle/vision/transforms/functional.py
--- a/python/paddle/vision/transforms/transforms.py
+++ b/python/paddle/vision/transforms/transforms.py
--- a/python/requirements.txt
+++ b/python/requirements.txt
--- a/python/setup.py.in
+++ b/python/setup.py.in
--- a/tools/get_pr_ut.py
+++ b/tools/get_pr_ut.py