From c5affb7807825dc5cf2d58fc5adb721010c5a922 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Wed, 9 Feb 2022 15:11:35 +0800
Subject: [PATCH] [pten] fit pten for amp (#39403)

* fit pten for amp

* fix typo
---
 paddle/fluid/imperative/amp_auto_cast.cc | 108 +++++++++++++++++++----
 paddle/fluid/imperative/amp_auto_cast.h  |   6 ++
 paddle/fluid/pybind/pybind.cc            |  63 +------------
 3 files changed, 101 insertions(+), 76 deletions(-)

diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc
index 2d97a1e3b6..0913d54c83 100644
--- a/paddle/fluid/imperative/amp_auto_cast.cc
+++ b/paddle/fluid/imperative/amp_auto_cast.cc
@@ -25,6 +25,80 @@ namespace imperative {
 
 class VarBase;
 
+// According to the input `place` and `dtype`, this function returns a tuple
+// consisting of three sets:
+// 1) All operators registered in the Paddle framework.
+// 2) All operators supported for `place` and `dtype`.
+// 3) All operators unsupported for `place` and `dtype`.
+// The input `place` is a string: `GPU`, `CPU`, `XPU`, `NPU` or `MLU`.
+// The input `dtype` is a type of paddle::framework::proto::VarType::Type,
+// which can be paddle::framework::proto::VarType::FP16,
+// paddle::framework::proto::VarType::FP32 and so on.
+std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>,
+           std::unordered_set<std::string>>
+OpSupportedInfos(const std::string& place,
+                 framework::proto::VarType::Type dtype) {
+  std::string query_place;
+  std::transform(place.begin(), place.end(), std::back_inserter(query_place),
+                 [](unsigned char c) { return std::toupper(c); });
+  using fn_type = std::add_pointer<bool(const platform::Place&)>::type;
+  std::unordered_map<std::string, fn_type> is_target_place{
+      {"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place},
+      {"XPU", &platform::is_xpu_place}, {"NPU", &platform::is_npu_place},
+      {"MLU", &platform::is_mlu_place},
+  };
+  PADDLE_ENFORCE_NE(is_target_place.count(query_place), 0,
+                    platform::errors::InvalidArgument(
+                        "The argument `place` should be 'GPU', 'CPU', 'XPU', "
+                        "'NPU' or 'MLU', but got '%s'.",
+                        place));
+
+  std::unordered_set<std::string> all_ops;
+  const auto& op_info = framework::OpInfoMap::Instance().map();
+  for (auto it = op_info.begin(); it != op_info.end(); it++) {
+    all_ops.emplace(it->first);
+  }
+
+  std::unordered_set<std::string> supported_ops;
+  auto& all_kernels = framework::OperatorWithKernel::AllOpKernels();
+  for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
+    for (auto& kernel_type : it->second) {
+      if (is_target_place[query_place](kernel_type.first.place_) &&
+          kernel_type.first.data_type_ == dtype) {
+        supported_ops.emplace(it->first);
+      }
+    }
+  }
+
+  auto pten_kernels = pten::KernelFactory::Instance().kernels();
+  for (auto& kernel_pair : pten_kernels) {
+    auto op_type = pten::TransToFluidOpName(kernel_pair.first);
+    for (auto& info_pair : kernel_pair.second) {
+      framework::OpKernelType kernel_type =
+          framework::TransPtenKernelKeyToOpKernelType(info_pair.first);
+      if (is_target_place[query_place](kernel_type.place_) &&
+          kernel_type.data_type_ == dtype && all_ops.count(op_type)) {
+        VLOG(4) << op_type << " " << supported_ops.size();
+        supported_ops.emplace(op_type);
+      }
+    }
+  }
+
+  std::unordered_set<std::string> unsupported_ops;
+  for (auto& op : all_ops) {
+    if (!supported_ops.count(op)) {
+      unsupported_ops.emplace(op);
+    }
+  }
+
+  VLOG(4) << "-- The size of all_ops: " << all_ops.size() << " --";
+  VLOG(4) << "-- The size of supported_ops: " << supported_ops.size() << " --";
+  VLOG(4) << "-- The size of unsupported_ops: " << unsupported_ops.size()
+          << " --";
+  return std::make_tuple(std::move(all_ops), std::move(supported_ops),
+                         std::move(unsupported_ops));
+}
+
 AutoCastGuard::AutoCastGuard(std::shared_ptr<Tracer> tracer, AmpLevel level)
     : tracer_(tracer) {
   pre_amp_level_ = tracer_->GetAmpLevel();
@@ -40,21 +114,25 @@ AmpOperators::AmpOperators()
     : allow_ops_(new std::unordered_set<std::string>()),
       block_ops_(new std::unordered_set<std::string>()),
      unsupported_fp16_ops_(new std::unordered_set<std::string>()) {
-  auto& all_kernels = framework::OperatorWithKernel::AllOpKernels();
-  auto fp16_dtype = framework::proto::VarType::FP16;
-  for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
-    bool supported = false;
-    for (auto& kernel_type : it->second) {
-      if ((platform::is_gpu_place(kernel_type.first.place_) ||
-           platform::is_xpu_place(kernel_type.first.place_)) &&
-          kernel_type.first.data_type_ == fp16_dtype) {
-        supported = true;
-      }
-    }
-    if (!supported) {
-      unsupported_fp16_ops_->insert(it->first);
-    }
-  }
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  auto unsupported_ops_gpu = std::get<2>(
+      OpSupportedInfos("GPU", paddle::framework::proto::VarType::FP16));
+  unsupported_fp16_ops_->insert(unsupported_ops_gpu.begin(),
+                                unsupported_ops_gpu.end());
+// NOTE: GPU/NPU/XPU are compiled separately.
+#elif defined(PADDLE_WITH_ASCEND_CL)
+  auto unsupported_ops_npu = std::get<2>(
+      OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16));
+  unsupported_fp16_ops_->insert(unsupported_ops_npu.begin(),
+                                unsupported_ops_npu.end());
+#elif defined(PADDLE_WITH_XPU)
+  auto unsupported_ops_xpu = std::get<2>(
+      OpSupportedInfos("XPU", paddle::framework::proto::VarType::FP16));
+  unsupported_fp16_ops_->insert(unsupported_ops_xpu.begin(),
+                                unsupported_ops_xpu.end());
+#endif
+  VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
+          << unsupported_fp16_ops_->size();
 }
 
 AmpOperators::~AmpOperators() {}
diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h
index 0a45798a52..775f9f973a 100644
--- a/paddle/fluid/imperative/amp_auto_cast.h
+++ b/paddle/fluid/imperative/amp_auto_cast.h
@@ -19,6 +19,7 @@
 #include <tuple>
 #include <unordered_set>
 
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
 namespace paddle {
@@ -32,6 +33,11 @@ enum class AmpLevel {
   O3,  // fp16
 };
 
+std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>,
+           std::unordered_set<std::string>>
+OpSupportedInfos(const std::string& place,
+                 framework::proto::VarType::Type dtype);
+
 class Tracer;
 
 // Singleton implementation with C++ 11
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index e31935848a..8f20daf337 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -59,6 +59,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/trainer.h"
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/framework/version.h"
+#include "paddle/fluid/imperative/amp_auto_cast.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/memory/allocation/mmap_allocator.h"
@@ -304,66 +305,6 @@ bool SupportsVNNI() {
 #endif
 }
 
-// According to the input `place` and `dtype`, this function returns a tuple
-// consists of three sets:
-// 1) All operators registered in the Paddle framework.
-// 2) All operators supported for `place` and `dtype`.
-// 3) All operators unsupported for `place` and `dtype`.
-// The input `place` is a type of string, which can only be `GPU` or `CPU`.
-// The input `dtype` is a type of paddle::framework::proto::VarType::Type,
-// which can be paddle::framework::proto::VarType::FP16,
-// paddle::framework::proto::VarType::FP32 and so on.
-std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>,
-           std::unordered_set<std::string>>
-OpSupportedInfos(const std::string &place,
-                 framework::proto::VarType::Type dtype) {
-  std::string query_place;
-  std::transform(place.begin(), place.end(), std::back_inserter(query_place),
-                 [](unsigned char c) { return std::toupper(c); });
-  using fn_type = std::add_pointer<bool(const platform::Place &)>::type;
-  std::unordered_map<std::string, fn_type> is_target_place{
-      {"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place},
-      {"XPU", &platform::is_xpu_place}, {"NPU", &platform::is_npu_place},
-      {"MLU", &platform::is_mlu_place},
-  };
-  PADDLE_ENFORCE_NE(
-      is_target_place.count(query_place), 0,
-      platform::errors::InvalidArgument(
-          "The argument `place` should be 'GPU' or 'CPU', but get '%s'.",
-          place));
-
-  std::unordered_set<std::string> all_ops;
-  const auto &op_info = framework::OpInfoMap::Instance().map();
-  for (auto it = op_info.begin(); it != op_info.end(); it++) {
-    all_ops.emplace(it->first);
-  }
-
-  std::unordered_set<std::string> supported_ops;
-  auto &all_kernels = framework::OperatorWithKernel::AllOpKernels();
-  for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
-    for (auto &kernel_type : it->second) {
-      if (is_target_place[query_place](kernel_type.first.place_) &&
-          kernel_type.first.data_type_ == dtype) {
-        supported_ops.emplace(it->first);
-      }
-    }
-  }
-
-  std::unordered_set<std::string> unsupported_ops;
-  for (auto &op : all_ops) {
-    if (!supported_ops.count(op)) {
-      unsupported_ops.emplace(op);
-    }
-  }
-
-  VLOG(4) << "-- The size of all_ops: " << all_ops.size() << " --";
-  VLOG(4) << "-- The size of supported_ops: " << supported_ops.size() << " --";
-  VLOG(4) << "-- The size of unsupported_ops: " << unsupported_ops.size()
-          << " --";
-  return std::make_tuple(std::move(all_ops), std::move(supported_ops),
-                         std::move(unsupported_ops));
-}
-
 bool IsCompiledWithBrpc() {
 #ifndef PADDLE_WITH_DISTRIBUTE
   return false;
@@ -2449,7 +2390,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance);
   m.def("supports_int8", SupportsInt8);
   m.def("supports_vnni", SupportsVNNI);
-  m.def("op_supported_infos", OpSupportedInfos);
+  m.def("op_supported_infos", imperative::OpSupportedInfos);
   m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
   m.def("_cuda_synchronize", [](const platform::CUDAPlace &place) {
-- 
GitLab
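For context on how the relocated helper is consumed: once this patch is applied, any C++ code inside the Paddle tree can call `imperative::OpSupportedInfos` directly, exactly as the new `AmpOperators` constructor does. Below is a minimal sketch under that assumption; the wrapper function `PrintUnsupportedFp16Ops` and its includes are illustrative and not part of the patch:

```cpp
#include <iostream>
#include <string>
#include <tuple>

#include "paddle/fluid/imperative/amp_auto_cast.h"

// Hypothetical helper: print every registered op that lacks an FP16 kernel
// on `place` ("GPU", "CPU", "XPU", "NPU" or "MLU"). OpSupportedInfos returns
// {all_ops, supported_ops, unsupported_ops}; std::get<2> picks the
// unsupported set, mirroring what AmpOperators' constructor now does.
void PrintUnsupportedFp16Ops(const std::string& place) {
  auto infos = paddle::imperative::OpSupportedInfos(
      place, paddle::framework::proto::VarType::FP16);
  for (const auto& op : std::get<2>(infos)) {
    std::cout << op << "\n";
  }
}
```

The Python surface is unchanged: the pybind hunk only re-points the existing `op_supported_infos` binding at `imperative::OpSupportedInfos`, so callers keep receiving the same three sets, now computed with pten kernels included.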