From 69dd43d123c76bbeee16cccc4daa751349c4de80 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Tue, 15 Mar 2022 16:26:51 +0800 Subject: [PATCH] [NPU] add AMP O1 support (#40362) * [NPU] add AMP O1 support * [NPU] fix NOTE and warnings --- paddle/fluid/imperative/amp_auto_cast.cc | 4 ++- paddle/fluid/pybind/op_function_generator.h | 1 + python/paddle/fluid/dygraph/amp/auto_cast.py | 11 ++++-- .../paddle/fluid/dygraph/amp/loss_scaler.py | 35 +++++++++++++------ 4 files changed, 37 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 149202468b..dd00b75666 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -209,7 +209,9 @@ inline bool NeedCast(const std::shared_ptr& var) { auto data_type = GetDataType(var); if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || - paddle::platform::is_xpu_place(place)) { + paddle::platform::is_xpu_place(place) || + paddle::platform::is_npu_place(place) || + paddle::platform::is_npu_pinned_place(place)) { // CudaPinndePlace is added for varbase created by dataloader if (data_type == paddle::framework::proto::VarType::FP32 || data_type == paddle::framework::proto::VarType::FP16 || diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 9e86e3df8a..d8750c1d6c 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -88,6 +88,7 @@ std::map> op_ins_map = { {"nce", {"Input", "Label", "Weight", "Bias", "SampleWeight", "CustomDistProbs", "CustomDistAlias", "CustomDistAliasProbs"}}, + {"check_finite_and_unscale", {"X", "Scale", "FloatStatus"}}, }; // NOTE(zhiqiu): Like op_ins_map. diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 191661b7bf..a449bdf0a1 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -271,14 +271,19 @@ def amp_guard(enable=True, "current_tracer is None, maybe it is not in imperative mode.") # check device_type: - # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16. + # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, npu for float16. # Maybe we will support cpu for bfloat16. if enable and not (tracer._expected_place.is_gpu_place() or - tracer._expected_place.is_xpu_place()): + tracer._expected_place.is_xpu_place() or + tracer._expected_place.is_npu_place()): warnings.warn( - 'amp_guard can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.' + 'amp_guard can only be enabled on CUDAPlace, XPUPlace, and NPUPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False + # For npu: + if tracer._expected_place.is_npu_place() and (dtype == 'bfloat16'): + warnings.warn('NPUPlace only support float16 amp.') + enable = False # For xpu: if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'): warnings.warn('XPUPlace only support float16 amp.') diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index f7c2d6be57..3ca4c7dca7 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -105,9 +105,10 @@ class AmpScaler(object): "current_tracer is None, maybe it is not in imperative mode.") if enable and not (tracer._expected_place.is_gpu_place() or - tracer._expected_place.is_xpu_place()): + tracer._expected_place.is_xpu_place() or + tracer._expected_place.is_npu_place()): warnings.warn( - 'AmpScaler can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.' + 'AmpScaler can only be enabled on CUDAPlace, XPUPlace and NPUPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False @@ -286,14 +287,28 @@ class AmpScaler(object): ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 ) ] - if len(param_grads_fp16): - _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, - param_grads_fp16, - self._temp_found_inf_fp16) - if len(param_grads_fp32): - _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, - param_grads_fp32, - self._temp_found_inf_fp32) + if core.is_compiled_with_npu(): + float_status = _C_ops.alloc_float_status() + _C_ops.clear_float_status(float_status, float_status) + + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + float_status, param_grads_fp16, + self._temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + float_status, param_grads_fp32, + self._temp_found_inf_fp32) + else: + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + self._temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + self._temp_found_inf_fp32) + if len(param_grads_fp16) and len(param_grads_fp32): self._found_inf = self._temp_found_inf_fp16 or self._temp_found_inf_fp32 elif len(param_grads_fp16): -- GitLab