From c5affb7807825dc5cf2d58fc5adb721010c5a922 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Wed, 9 Feb 2022 15:11:35 +0800
Subject: [PATCH] [pten] fit pten for amp (#39403)

* fit pten for amp

* fix typo
---
 paddle/fluid/imperative/amp_auto_cast.cc | 108 +++++++++++++++++++----
 paddle/fluid/imperative/amp_auto_cast.h  |   6 ++
 paddle/fluid/pybind/pybind.cc            |  63 +------------
 3 files changed, 101 insertions(+), 76 deletions(-)

diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc
index 2d97a1e3b6..0913d54c83 100644
--- a/paddle/fluid/imperative/amp_auto_cast.cc
+++ b/paddle/fluid/imperative/amp_auto_cast.cc
@@ -25,6 +25,80 @@ namespace imperative {
 
 class VarBase;
 
+// According to the input `place` and `dtype`, this function returns a tuple
+// consisting of three sets:
+// 1) All operators registered in the Paddle framework.
+// 2) All operators supported for `place` and `dtype`.
+// 3) All operators unsupported for `place` and `dtype`.
+// The input `place` is a string: `GPU`, `CPU`, `XPU`, `NPU` or `MLU`.
+// The input `dtype` is a type of paddle::framework::proto::VarType::Type,
+// which can be paddle::framework::proto::VarType::FP16,
+// paddle::framework::proto::VarType::FP32 and so on.
+std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>,
+           std::unordered_set<std::string>>
+OpSupportedInfos(const std::string& place,
+                 framework::proto::VarType::Type dtype) {
+  std::string query_place;
+  std::transform(place.begin(), place.end(), std::back_inserter(query_place),
+                 [](unsigned char c) { return std::toupper(c); });
+  using fn_type = std::add_pointer<bool(const platform::Place&)>::type;
+  std::unordered_map<std::string, fn_type> is_target_place{
+      {"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place},
+      {"XPU", &platform::is_xpu_place}, {"NPU", &platform::is_npu_place},
+      {"MLU", &platform::is_mlu_place},
+  };
+  PADDLE_ENFORCE_NE(is_target_place.count(query_place), 0,
+                    platform::errors::InvalidArgument(
+                        "The argument `place` should be 'GPU', 'CPU', 'XPU', "
+                        "'NPU' or 'MLU', but got '%s'.",
+                        place));
+
+  std::unordered_set<std::string> all_ops;
+  const auto& op_info = framework::OpInfoMap::Instance().map();
+  for (auto it = op_info.begin(); it != op_info.end(); it++) {
+    all_ops.emplace(it->first);
+  }
+
+  std::unordered_set<std::string> supported_ops;
+  auto& all_kernels = framework::OperatorWithKernel::AllOpKernels();
+  for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
+    for (auto& kernel_type : it->second) {
+      if (is_target_place[query_place](kernel_type.first.place_) &&
+          kernel_type.first.data_type_ == dtype) {
+        supported_ops.emplace(it->first);
+      }
+    }
+  }
+
+  auto pten_kernels = pten::KernelFactory::Instance().kernels();
+  for (auto& kernel_pair : pten_kernels) {
+    auto op_type = pten::TransToFluidOpName(kernel_pair.first);
+    for (auto& info_pair : kernel_pair.second) {
+      framework::OpKernelType kernel_type =
+          framework::TransPtenKernelKeyToOpKernelType(info_pair.first);
+      if (is_target_place[query_place](kernel_type.place_) &&
+          kernel_type.data_type_ == dtype && all_ops.count(op_type)) {
+        VLOG(4) << op_type << " " << supported_ops.size();
+        supported_ops.emplace(op_type);
+      }
+    }
+  }
+
+  std::unordered_set<std::string> unsupported_ops;
+  for (auto& op : all_ops) {
+    if (!supported_ops.count(op)) {
+      unsupported_ops.emplace(op);
+    }
+  }
+
+  VLOG(4) << "-- The size of all_ops: " << all_ops.size() << " --";
+  VLOG(4) << "-- The size of supported_ops: " << supported_ops.size() << " --";
+  VLOG(4) << "-- The size of unsupported_ops: " << unsupported_ops.size()
+          << " --";
+  return std::make_tuple(std::move(all_ops), std::move(supported_ops),
+                         std::move(unsupported_ops));
+}
+
 AutoCastGuard::AutoCastGuard(std::shared_ptr<Tracer> tracer, AmpLevel level)
     : tracer_(tracer) {
   pre_amp_level_ = tracer_->GetAmpLevel();
@@ -40,21 +114,25 @@ AmpOperators::AmpOperators()
     : allow_ops_(new std::unordered_set<std::string>()),
       block_ops_(new std::unordered_set<std::string>()),
      unsupported_fp16_ops_(new std::unordered_set<std::string>()) {
-  auto& all_kernels = framework::OperatorWithKernel::AllOpKernels();
-  auto fp16_dtype = framework::proto::VarType::FP16;
-  for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
-    bool supported = false;
-    for (auto& kernel_type : it->second) {
-      if ((platform::is_gpu_place(kernel_type.first.place_) ||
-           platform::is_xpu_place(kernel_type.first.place_)) &&
-          kernel_type.first.data_type_ == fp16_dtype) {
-        supported = true;
-      }
-    }
-    if (!supported) {
-      unsupported_fp16_ops_->insert(it->first);
-    }
-  }
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  auto unsupported_ops_gpu = std::get<2>(
+      OpSupportedInfos("GPU", paddle::framework::proto::VarType::FP16));
+  unsupported_fp16_ops_->insert(unsupported_ops_gpu.begin(),
+                                unsupported_ops_gpu.end());
+// NOTE: GPU/NPU/XPU are compiled separately.
+#elif defined(PADDLE_WITH_ASCEND_CL)
+  auto unsupported_ops_npu = std::get<2>(
+      OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16));
+  unsupported_fp16_ops_->insert(unsupported_ops_npu.begin(),
+                                unsupported_ops_npu.end());
+#elif defined(PADDLE_WITH_XPU)
+  auto unsupported_ops_xpu = std::get<2>(
+      OpSupportedInfos("XPU", paddle::framework::proto::VarType::FP16));
+  unsupported_fp16_ops_->insert(unsupported_ops_xpu.begin(),
+                                unsupported_ops_xpu.end());
+#endif
+  VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
+          << unsupported_fp16_ops_->size();
 }
 
 AmpOperators::~AmpOperators() {}
diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h
index 0a45798a52..775f9f973a 100644
--- a/paddle/fluid/imperative/amp_auto_cast.h
+++ b/paddle/fluid/imperative/amp_auto_cast.h
@@ -19,6 +19,7 @@
 #include <tuple>
 #include <unordered_set>
 
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
 namespace paddle {
@@ -32,6 +33,11 @@ enum class AmpLevel {
   O3,  // fp16
 };
 
+std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>,
+           std::unordered_set<std::string>>
+OpSupportedInfos(const std::string& place,
+                 framework::proto::VarType::Type dtype);
+
 class Tracer;
 
 // Singleton implementation with C++ 11
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index e31935848a..8f20daf337 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -59,6 +59,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/trainer.h"
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/framework/version.h"
+#include "paddle/fluid/imperative/amp_auto_cast.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/memory/allocation/mmap_allocator.h"
@@ -304,66 +305,6 @@ bool SupportsVNNI() {
 #endif
 }
 
-// According to the input `place` and `dtype`, this function returns a tuple
-// consists of three sets:
-// 1) All operators registered in the Paddle framework.
-// 2) All operators supported for `place` and `dtype`.
-// 3) All operators unsupported for `place` and `dtype`.
-// The input `place` is a type of string, which can only be `GPU` or `CPU`.
-// The input `dtype` is a type of paddle::framework::proto::VarType::Type,
-// which can be paddle::framework::proto::VarType::FP16,
-// paddle::framework::proto::VarType::FP32 and so on.
-std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>,
-           std::unordered_set<std::string>>
-OpSupportedInfos(const std::string &place,
-                 framework::proto::VarType::Type dtype) {
-  std::string query_place;
-  std::transform(place.begin(), place.end(), std::back_inserter(query_place),
-                 [](unsigned char c) { return std::toupper(c); });
-  using fn_type = std::add_pointer<bool(const platform::Place &)>::type;
-  std::unordered_map<std::string, fn_type> is_target_place{
-      {"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place},
-      {"XPU", &platform::is_xpu_place}, {"NPU", &platform::is_npu_place},
-      {"MLU", &platform::is_mlu_place},
-  };
-  PADDLE_ENFORCE_NE(
-      is_target_place.count(query_place), 0,
-      platform::errors::InvalidArgument(
-          "The argument `place` should be 'GPU' or 'CPU', but get '%s'.",
-          place));
-
-  std::unordered_set<std::string> all_ops;
-  const auto &op_info = framework::OpInfoMap::Instance().map();
-  for (auto it = op_info.begin(); it != op_info.end(); it++) {
-    all_ops.emplace(it->first);
-  }
-
-  std::unordered_set<std::string> supported_ops;
-  auto &all_kernels = framework::OperatorWithKernel::AllOpKernels();
-  for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
-    for (auto &kernel_type : it->second) {
-      if (is_target_place[query_place](kernel_type.first.place_) &&
-          kernel_type.first.data_type_ == dtype) {
-        supported_ops.emplace(it->first);
-      }
-    }
-  }
-
-  std::unordered_set<std::string> unsupported_ops;
-  for (auto &op : all_ops) {
-    if (!supported_ops.count(op)) {
-      unsupported_ops.emplace(op);
-    }
-  }
-
-  VLOG(4) << "-- The size of all_ops: " << all_ops.size() << " --";
-  VLOG(4) << "-- The size of supported_ops: " << supported_ops.size() << " --";
-  VLOG(4) << "-- The size of unsupported_ops: " << unsupported_ops.size()
-          << " --";
-  return std::make_tuple(std::move(all_ops), std::move(supported_ops),
-                         std::move(unsupported_ops));
-}
-
 bool IsCompiledWithBrpc() {
 #ifndef PADDLE_WITH_DISTRIBUTE
   return false;
@@ -2449,7 +2390,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance);
   m.def("supports_int8", SupportsInt8);
   m.def("supports_vnni", SupportsVNNI);
-  m.def("op_supported_infos", OpSupportedInfos);
+  m.def("op_supported_infos", imperative::OpSupportedInfos);
   m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
   m.def("_cuda_synchronize", [](const platform::CUDAPlace &place) {
-- 
GitLab
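For context on how the relocated helper is consumed: once this patch is applied, any C++ code inside the Paddle tree can call `imperative::OpSupportedInfos` directly, exactly as the new `AmpOperators` constructor does. Below is a minimal sketch under that assumption; the wrapper function `PrintUnsupportedFp16Ops` and its includes are illustrative and not part of the patch:

```cpp
#include <iostream>
#include <string>
#include <tuple>

#include "paddle/fluid/imperative/amp_auto_cast.h"

// Hypothetical helper: print every registered op that lacks an FP16 kernel
// on `place` ("GPU", "CPU", "XPU", "NPU" or "MLU"). OpSupportedInfos returns
// {all_ops, supported_ops, unsupported_ops}; std::get<2> picks the
// unsupported set, mirroring what AmpOperators' constructor now does.
void PrintUnsupportedFp16Ops(const std::string& place) {
  auto infos = paddle::imperative::OpSupportedInfos(
      place, paddle::framework::proto::VarType::FP16);
  for (const auto& op : std::get<2>(infos)) {
    std::cout << op << "\n";
  }
}
```

The Python surface is unchanged: the pybind hunk only re-points the existing `op_supported_infos` binding at `imperative::OpSupportedInfos`, so callers keep receiving the same three sets, now computed with pten kernels included.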