diff --git a/paddle/fluid/eager/amp_utils.h b/paddle/fluid/eager/amp_utils.h
index c63912312b18d439e11b6495a4bd332c2e02216b..115811f6a3d8e4ace1e39a769e3259448fc4c766 100644
--- a/paddle/fluid/eager/amp_utils.h
+++ b/paddle/fluid/eager/amp_utils.h
@@ -100,7 +100,6 @@ inline paddle::experimental::DataType GetAmpDestDtype(
     if (paddle::imperative::AmpOperators::Instance()
             .GetMutableAllowOps()
             ->count(op_name)) {
-      paddle::imperative::AmpOperators::Instance().AddToAmpOpList(op_name);
       return paddle::experimental::DataType::FLOAT16;
     } else if (paddle::imperative::AmpOperators::Instance()
                    .GetMutableBlockOps()
@@ -118,8 +117,6 @@ inline paddle::experimental::DataType GetAmpDestDtype(
                    .GetMutableUnsupportedFp16Ops()
                    ->count(op_name)) {
       dst_type = paddle::experimental::DataType::FLOAT32;
-    } else {
-      paddle::imperative::AmpOperators::Instance().AddToAmpOpList(op_name);
     }
     return dst_type;
   }
@@ -132,8 +129,6 @@ inline paddle::experimental::DataType GetAmpDestDtype(
             .GetMutableBlockOps()
             ->count(op_name)) {
       dst_type = paddle::experimental::DataType::FLOAT32;
-    } else {
-      paddle::imperative::AmpOperators::Instance().AddToAmpOpList(op_name);
     }
     return dst_type;
   }
@@ -142,7 +137,6 @@ inline paddle::experimental::DataType GetAmpDestDtype(
     if (paddle::imperative::AmpOperators::Instance()
             .GetMutableAllowOps()
             ->count(op_name)) {
-      paddle::imperative::AmpOperators::Instance().AddToAmpOpList(op_name);
       return paddle::experimental::DataType::BFLOAT16;
     } else if (paddle::imperative::AmpOperators::Instance()
                    .GetMutableBlockOps()
@@ -158,8 +152,6 @@ inline paddle::experimental::DataType GetAmpDestDtype(
                    .GetMutableUnsupportedBf16Ops()
                    ->count(op_name)) {
       dst_type = paddle::experimental::DataType::FLOAT32;
-    } else {
-      paddle::imperative::AmpOperators::Instance().AddToAmpOpList(op_name);
     }
     return dst_type;
   }
@@ -172,8 +164,6 @@ inline paddle::experimental::DataType GetAmpDestDtype(
             .GetMutableBlockOps()
             ->count(op_name)) {
       dst_type = paddle::experimental::DataType::FLOAT32;
-    } else {
-      paddle::imperative::AmpOperators::Instance().AddToAmpOpList(op_name);
     }
     return dst_type;
   }
diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc
index f8cea38ea6dad7d6626af5ffc17a26b3320d55c7..55c15208208085feb9207cac22d105b4ceb96e80 100644
--- a/paddle/fluid/imperative/amp_auto_cast.cc
+++ b/paddle/fluid/imperative/amp_auto_cast.cc
@@ -22,7 +22,6 @@
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/imperative/var_helper.h"
 
-DECLARE_bool(low_precision_op_list);
 namespace paddle {
 namespace imperative {
 
@@ -194,16 +193,6 @@ AmpOperators::GetMutableUnsupportedBf16Ops() {
   return unsupported_bf16_ops_;
 }
 
-void AmpOperators::AddToAmpOpList(const std::string& op_name) {
-  if (FLAGS_low_precision_op_list) {
-    current_amp_ops_[op_name] += 1;
-  }
-}
-
-std::map<std::string, int> AmpOperators::GetAmpOpList() {
-  return current_amp_ops_;
-}
-
 std::ostream& operator<<(std::ostream& os, AmpOperators& ops) {
   os << "allow ops: ";
   auto allow_ops = ops.GetMutableAllowOps();
diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h
index 343b01dedb4269c40ae8412d9832306210277087..3bee2308603f9ed0bf98acab0a3c700ea8b737ca 100644
--- a/paddle/fluid/imperative/amp_auto_cast.h
+++ b/paddle/fluid/imperative/amp_auto_cast.h
@@ -60,10 +60,6 @@ class AmpOperators {
   std::shared_ptr<std::unordered_set<std::string>>
   GetMutableUnsupportedBf16Ops();
 
-  void AddToAmpOpList(const std::string& op_name);
-
-  std::map<std::string, int> GetAmpOpList();
-
  private:
   AmpOperators();  // forbid calling default constructor
 
@@ -80,9 +76,6 @@ class AmpOperators {
 
   // The set of ops that has no bf16 CUDA kennel.
   std::shared_ptr<std::unordered_set<std::string>> unsupported_bf16_ops_;
-
-  // The amp op list of current module.
-  std::map<std::string, int> current_amp_ops_;
 };
 
 std::ostream& operator<<(std::ostream& os, AmpOperators& ops);
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 2b07b0d9cd2631faf4f255d62c1803b134199ca3..c457b14325e7977287cd8b59279211c6b486c659 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -2546,7 +2546,7 @@ All parameter, weight, gradient are variables in Paddle.
         [] { return phi::autotune::AutoTuneStatus::Instance().Update(); });
 
   m.def("get_low_precision_op_list", [] {
-    return paddle::imperative::AmpOperators::Instance().GetAmpOpList();
+    return phi::KernelFactory::Instance().GetLowPrecisionKernelList();
   });
 
   m.def("autotune_status", [] {
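Note: with the binding above, the accumulated statistics can be queried directly from Python. A minimal sketch (the flag is read at process startup, so it must be exported before `import paddle`; the sample output is illustrative):

```python
import os

# FLAGS_* environment variables are picked up when paddle initializes,
# so export the flag before the first paddle import.
os.environ["FLAGS_low_precision_op_list"] = "1"

import paddle

# Returns the map accumulated by KernelFactory::AddToLowPrecisionKernelList:
# forward op name -> number of times its kernel was selected.
op_stats = paddle.fluid.core.get_low_precision_op_list()
print(op_stats)  # e.g. {'conv2d': 1, 'elementwise_add': 1}
```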
diff --git a/paddle/phi/api/yaml/generator/api_base.py b/paddle/phi/api/yaml/generator/api_base.py
index e1def4e913cdc38d92e0622d4a89c8dfbf38bc81..6e5702fdf7b786c530f8fb845306d0e00b1ce061 100644
--- a/paddle/phi/api/yaml/generator/api_base.py
+++ b/paddle/phi/api/yaml/generator/api_base.py
@@ -1200,6 +1200,9 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
 {code_indent}  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
 {code_indent}      "{kernel_name}", {{kernel_backend, kernel_layout, kernel_data_type}});
 {code_indent}  const auto& kernel = kernel_result.kernel;
+{code_indent}  if (FLAGS_low_precision_op_list) {{
+{code_indent}    phi::KernelFactory::Instance().AddToLowPrecisionKernelList("{self.api}", kernel_data_type);
+{code_indent}  }}
 {code_indent}  VLOG(6) << "{kernel_name} kernel: " << kernel;
 {code_indent}  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);
 {input_tensors}
diff --git a/paddle/phi/api/yaml/generator/api_gen.py b/paddle/phi/api/yaml/generator/api_gen.py
index 0a05ec6eb32f7668350fa16dee5c206731eb0348..cb3abebef6cb5d3d1436f429e49dd27222890b66 100644
--- a/paddle/phi/api/yaml/generator/api_gen.py
+++ b/paddle/phi/api/yaml/generator/api_gen.py
@@ -347,6 +347,7 @@ def source_include(header_file_path):
 #include "paddle/fluid/platform/profiler/supplement_tracing.h"
 
 DECLARE_bool(conv2d_disable_cudnn);
+DECLARE_int32(low_precision_op_list);
 """
 
 
diff --git a/paddle/phi/api/yaml/generator/backward_api_gen.py b/paddle/phi/api/yaml/generator/backward_api_gen.py
index f01200ec3ac411b492e54f8c2c7a809a301f7818..f50a2318f831551435c41983ff6a87218f4b64b2 100644
--- a/paddle/phi/api/yaml/generator/backward_api_gen.py
+++ b/paddle/phi/api/yaml/generator/backward_api_gen.py
@@ -290,6 +290,7 @@ def source_include(header_file_path):
 #include "paddle/fluid/platform/profiler/supplement_tracing.h"
 
 DECLARE_bool(conv2d_disable_cudnn);
+DECLARE_int32(low_precision_op_list);
 """
 
 
diff --git a/paddle/phi/api/yaml/generator/intermediate_api_gen.py b/paddle/phi/api/yaml/generator/intermediate_api_gen.py
index 9cab819aa4a6f62b06edf1894f24a9008200ff68..98e7b9755eae468eb20e11a31dc6db365a372642 100644
--- a/paddle/phi/api/yaml/generator/intermediate_api_gen.py
+++ b/paddle/phi/api/yaml/generator/intermediate_api_gen.py
@@ -54,6 +54,8 @@ def source_include(header_file_path):
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 #include "paddle/fluid/platform/profiler/supplement_tracing.h"
+
+DECLARE_int32(low_precision_op_list);
 """
 
 
diff --git a/paddle/phi/api/yaml/generator/sparse_api_gen.py b/paddle/phi/api/yaml/generator/sparse_api_gen.py
index 3dbd424be2c6bd49a4c8b40ae71f9d990fe52144..276e2555876392707b83fd7447ce3f1bc7ffaee2 100644
--- a/paddle/phi/api/yaml/generator/sparse_api_gen.py
+++ b/paddle/phi/api/yaml/generator/sparse_api_gen.py
@@ -221,6 +221,9 @@ class SparseAPI(ForwardAPI):
     auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
         "{kernel_name}", {{kernel_backend, kernel_layout, kernel_data_type}});
     const auto& phi_kernel = kernel_result.kernel;
+    if (FLAGS_low_precision_op_list) {{
+      phi::KernelFactory::Instance().AddToLowPrecisionKernelList("{self.api}", kernel_data_type);
+    }}
     VLOG(6) << "{self.api} api sparse kernel: " << phi_kernel;
 
     auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);
@@ -324,6 +327,8 @@ def source_include(header_file_path):
 #include "paddle/phi/infermeta/sparse/unary.h"
 #include "paddle/phi/infermeta/sparse/binary.h"
 #include "paddle/phi/infermeta/sparse/multiary.h"
+
+DECLARE_int32(low_precision_op_list);
 """
diff --git a/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py
index b77733273621d90e929af9da9397a0795f94873f..eec3f734545247d3e0fdda03138b0ade56cd9f49 100644
--- a/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py
+++ b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py
@@ -134,6 +134,8 @@ def source_include(header_file_path):
 #include "paddle/phi/infermeta/sparse/unary.h"
 #include "paddle/phi/infermeta/sparse/binary.h"
 #include "paddle/phi/infermeta/sparse/backward.h"
+
+DECLARE_int32(low_precision_op_list);
 """
diff --git a/paddle/phi/api/yaml/generator/strings_api_gen.py b/paddle/phi/api/yaml/generator/strings_api_gen.py
index 029fa9d8a20a8b6fedcb7cd1419fa6c55860e518..4b5daa3d7084d8e590fd03826e608341723179a8 100644
--- a/paddle/phi/api/yaml/generator/strings_api_gen.py
+++ b/paddle/phi/api/yaml/generator/strings_api_gen.py
@@ -210,6 +210,9 @@ class StringsAPI(ForwardAPI):
     VLOG(6) << "{self.api} api strings kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
     auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
         "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}});
+    if (FLAGS_low_precision_op_list) {{
+      phi::KernelFactory::Instance().AddToLowPrecisionKernelList("{self.api}", kernel_data_type);
+    }}
     const auto& kernel = kernel_result.kernel;
     VLOG(6) << "{self.api} api strings kernel: " << kernel;
 
@@ -334,6 +337,8 @@ def source_include(header_file_path):
 #include "paddle/phi/api/lib/api_registry.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/core/kernel_registry.h"
+
+DECLARE_int32(low_precision_op_list);
 """
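Note: all of the generator changes above splice the same guard into each generated API body. Conceptually, every template expands to a C++ snippet of the following shape; the helper below is only an illustration of that expansion (its name and signature are invented for this note, not code from this PR):

```python
# Illustration only: what the f-string templates above expand to for a
# given API. `api_name` stands in for the "{self.api}" template field.
def low_precision_guard(api_name: str, indent: str = "  ") -> str:
    return (
        f"{indent}if (FLAGS_low_precision_op_list) {{\n"
        f"{indent}  phi::KernelFactory::Instance().AddToLowPrecisionKernelList("
        f'"{api_name}", kernel_data_type);\n'
        f"{indent}}}\n"
    )

print(low_precision_guard("matmul"))
# if (FLAGS_low_precision_op_list) {
#   phi::KernelFactory::Instance().AddToLowPrecisionKernelList("matmul", kernel_data_type);
# }
```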
diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc
index 5f7058fdc1079c3ae0f8be7fe78cc5a56e8b5287..5b8dc47d6498177124bdc53be2d27f97546fe086 100644
--- a/paddle/phi/core/flags.cc
+++ b/paddle/phi/core/flags.cc
@@ -55,16 +55,19 @@ PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads,
 /**
  * Low Precision Op related FLAG
  * Name: FLAGS_low_precision_op_list
- * Since Version: 0.13.0
- * Value Range: bool, default=false
+ * Since Version: 2.5.0
+ * Value Range: int32, default=0
  * Example:
  * Note: Used to debug. Get the low precision op list of current module.
+ *       Levels of FLAGS_low_precision_op_list:
+ *       - 1, return the low precision op list of current module.
+ *       - 2, return the op list of current module.
  */
-PADDLE_DEFINE_EXPORTED_bool(low_precision_op_list,
-                            false,
-                            "Checking whether get the low precision op list of "
-                            "current module. It will be "
-                            "rerun the low precision list after module.");
+PADDLE_DEFINE_EXPORTED_int32(low_precision_op_list,
+                             0,
+                             "Setting the level of low precision op list "
+                             "printing. It will return the low precision op "
+                             "list of the current module.");
 
 /**
  * Operator related FLAG
diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc
index 992460fe8267c0661564cb138603589229293c0d..0809cfab3f702c86d84a1b9e2aec6425b49219e4 100644
--- a/paddle/phi/core/kernel_factory.cc
+++ b/paddle/phi/core/kernel_factory.cc
@@ -23,6 +23,7 @@
 #include "paddle/phi/core/compat/op_utils.h"
 #include "paddle/utils/string/string_helper.h"
 
+DECLARE_int32(low_precision_op_list);
 DECLARE_bool(enable_api_kernel_fallback);
 
 namespace phi {
@@ -106,9 +107,33 @@ bool KernelFactory::HasKernel(const std::string& kernel_name,
   return true;
 }
 
+void KernelFactory::AddToLowPrecisionKernelList(
+    const std::string& name,
+    const paddle::experimental::DataType& kernel_key_type) {
+  if (FLAGS_low_precision_op_list >= 1) {
+    auto op_name = phi::TransToFluidOpName(name);
+    if (op_name.find("_grad") != std::string::npos) {
+      return;  // only record forward api
+    }
+    bool is_low_precision =
+        (kernel_key_type == paddle::experimental::DataType::FLOAT16 ||
+         kernel_key_type == paddle::experimental::DataType::BFLOAT16);
+    bool need_record =
+        FLAGS_low_precision_op_list == 1 ? is_low_precision : true;
+    if (need_record) {
+      low_precision_kernels_[op_name] += 1;
+    }
+  }
+}
+
+std::map<std::string, int> KernelFactory::GetLowPrecisionKernelList() {
+  return low_precision_kernels_;
+}
+
 KernelResult KernelFactory::SelectKernelOrThrowError(
     const std::string& kernel_name, const KernelKey& const_kernel_key) const {
   auto iter = kernels_.find(kernel_name);
+
   PADDLE_ENFORCE_NE(
       iter,
       kernels_.end(),
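Note: the recording policy implemented in `AddToLowPrecisionKernelList` can be restated compactly. A Python mirror of the logic, for documentation purposes only (dtype strings stand in for `phi::DataType` values):

```python
# Python mirror of KernelFactory::AddToLowPrecisionKernelList (above),
# for documentation purposes only.
def need_record(flag_level: int, op_name: str, dtype: str) -> bool:
    if flag_level < 1 or "_grad" in op_name:  # disabled, or a backward kernel
        return False
    is_low_precision = dtype in ("float16", "bfloat16")
    # level 1: only low precision kernels; level >= 2: every forward kernel
    return is_low_precision if flag_level == 1 else True

assert need_record(1, "conv2d", "float16")
assert not need_record(1, "conv2d", "float32")       # fp32 hidden at level 1
assert need_record(2, "conv2d", "float32")           # level 2 records all dtypes
assert not need_record(2, "conv2d_grad", "float16")  # backward ops are skipped
```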
diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h
index ad0a83546eb9dabfbb32e1e6a9de0a5fb0bdaf8d..a106ac727c5d0dde0fed1a339057edf146b19d42 100644
--- a/paddle/phi/core/kernel_factory.h
+++ b/paddle/phi/core/kernel_factory.h
@@ -14,12 +14,12 @@
 
 #pragma once
 
+#include <map>
 #include <ostream>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
-
 #include "paddle/phi/common/backend.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/layout.h"
@@ -305,10 +305,19 @@ class KernelFactory {
   const KernelArgsDef& GetFirstKernelArgsDef(
       const std::string& kernel_name) const;
 
+  void AddToLowPrecisionKernelList(
+      const std::string& name,
+      const paddle::experimental::DataType& kernel_key_type);
+
+  std::map<std::string, int> GetLowPrecisionKernelList();
+
  private:
   KernelFactory() = default;
 
   KernelNameMap kernels_;
+
+  // The low precision kernel list of the current module.
+  std::map<std::string, int> low_precision_kernels_;
 };
 
 inline std::ostream& operator<<(std::ostream& os, const KernelKey& kernel_key) {
diff --git a/paddle/phi/tests/api/scale_api.h b/paddle/phi/tests/api/scale_api.h
index ec0d6d213a2298ccfa84df8be3182e9cdd3b1ebd..2284ce0a42ceadef69d1ae7bfed96613091b8d30 100644
--- a/paddle/phi/tests/api/scale_api.h
+++ b/paddle/phi/tests/api/scale_api.h
@@ -25,6 +25,7 @@
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/scale_kernel.h"
 
+DECLARE_int32(low_precision_op_list);
 namespace paddle {
 namespace experimental {
 
@@ -54,6 +55,10 @@ PADDLE_API Tensor scale_kernel_context(const Tensor& x,
   auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
       "scale", {kernel_backend, kernel_layout, kernel_data_type});
   const auto& kernel = kernel_result.kernel;
+  if (FLAGS_low_precision_op_list) {
+    phi::KernelFactory::Instance().AddToLowPrecisionKernelList(
+        "scale", kernel_data_type);
+  }
   VLOG(6) << "scale API kernel key: [" << kernel_backend << ", "
           << kernel_layout << ", " << kernel_data_type << "]";
   VLOG(6) << "scale API kernel: " << kernel;
@@ -225,6 +230,10 @@ Tensor scale_switch_case(const Tensor& x,
   auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
       "scale", {kernel_backend, kernel_layout, kernel_data_type});
   const auto& kernel = kernel_result.kernel;
+  if (FLAGS_low_precision_op_list) {
+    phi::KernelFactory::Instance().AddToLowPrecisionKernelList(
+        "scale", kernel_data_type);
+  }
   VLOG(6) << "scale API kernel key: [" << kernel_backend << ", "
           << kernel_layout << ", " << kernel_data_type << "]";
   VLOG(6) << "scale API kernel: " << kernel;
diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py
index 6eb63040e082d93b18a9beff22d2e49b8640c2fb..6c8ddbd579359c2f84e07771fa9c05a3aa03ab91 100644
--- a/python/paddle/amp/auto_cast.py
+++ b/python/paddle/amp/auto_cast.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import copy
+import os
 import warnings
 
 import paddle
@@ -94,18 +95,23 @@ _g_amp_state_ = None
 
 
 def low_precision_op_list():
-    op_list = paddle.fluid.core.get_low_precision_op_list()
-    op_count = 0
-    print('<---------------- low precision op list ------------------->')
-    print('<---- op name ------|------- op count---------------------->')
-    for x in op_list:
-        print('  %-18s|  %4d' % (x, op_list[x]))
-        op_count += 1
-    print(
-        '<------------- low precision op num:{:5d} ----------------->'.format(
-            op_count
+    if os.getenv("FLAGS_low_precision_op_list") is not None:
+        level = int(os.getenv("FLAGS_low_precision_op_list"))
+        if level == 0:
+            return
+        if level == 1:
+            print('<{:-^60}>'.format(" low precision op list "))
+        else:
+            print('<{:-^60}>'.format(" op list "))
+        op_list = paddle.fluid.core.get_low_precision_op_list()
+        op_count = 0
+        print(
+            '<{:-^40}'.format(" op_name "), '|', '{:-^17}>'.format(" op count ")
         )
-    )
+        for x in op_list:
+            print('  %-40s|  %-15d' % (x, op_list[x]))
+            op_count += 1
+        print('<{:-^60}>'.format(" op count: " + str(op_count) + " "))
 
 
 def amp_state():
diff --git a/python/paddle/fluid/tests/unittests/test_low_precision_list.py b/python/paddle/fluid/tests/unittests/test_low_precision_list.py
index afa737bfed45f69a42dc293ddc60b2bbb94ad77f..0641a21be6354ae09007ff051ccef57edb01e72c 100644
--- a/python/paddle/fluid/tests/unittests/test_low_precision_list.py
+++ b/python/paddle/fluid/tests/unittests/test_low_precision_list.py
@@ -25,12 +25,11 @@ class TestAMPList(unittest.TestCase):
         b = paddle.rand([2, 3])
 
         # amp list conv2d, cast
-        with paddle.amp.auto_cast():
+        with paddle.amp.auto_cast(enable=True, level='O2'):
             conv = conv2d(data)
             c = a + b
         paddle.amp.low_precision_op_list()
         op_list = paddle.fluid.core.get_low_precision_op_list()
-        print(conv.dtype)
         if conv.dtype == paddle.float16:
             self.assertTrue('elementwise_add' in op_list)
             self.assertTrue('conv2d' in op_list)
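Note: end-to-end, the feature is exercised the same way as the updated unit test. A minimal sketch (tensor shapes and the exact table layout are illustrative, and float16 kernels require a suitable GPU build):

```python
import os

os.environ["FLAGS_low_precision_op_list"] = "1"  # set before importing paddle

import paddle

conv2d = paddle.nn.Conv2D(4, 6, (3, 3))
data = paddle.rand([2, 4, 8, 8])

# Run a small O2 AMP region, then print the collected statistics.
with paddle.amp.auto_cast(enable=True, level='O2'):
    out = conv2d(data)

paddle.amp.low_precision_op_list()
# Prints a table along these lines on a float16-capable device:
# <------------------- low precision op list ------------------>
# <--------------- op_name --------------- | ---- op count ---->
#   conv2d                                 |  1
# <----------------------- op count: 1 ----------------------->
```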