Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
69dd43d1
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
69dd43d1
编写于
3月 15, 2022
作者:
F
furnace
提交者:
GitHub
3月 15, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[NPU] add AMP O1 support (#40362)
* [NPU] add AMP O1 support * [NPU] fix NOTE and warnings
上级
2c5edb4f
变更
4
隐藏空白更改
内联
并排
Showing
4 changed files
with
37 additions
and
14 deletions
+37
-14
paddle/fluid/imperative/amp_auto_cast.cc
paddle/fluid/imperative/amp_auto_cast.cc
+3
-1
paddle/fluid/pybind/op_function_generator.h
paddle/fluid/pybind/op_function_generator.h
+1
-0
python/paddle/fluid/dygraph/amp/auto_cast.py
python/paddle/fluid/dygraph/amp/auto_cast.py
+8
-3
python/paddle/fluid/dygraph/amp/loss_scaler.py
python/paddle/fluid/dygraph/amp/loss_scaler.py
+25
-10
未找到文件。
paddle/fluid/imperative/amp_auto_cast.cc
浏览文件 @
69dd43d1
...
...
@@ -209,7 +209,9 @@ inline bool NeedCast(const std::shared_ptr<VarType>& var) {
auto
data_type
=
GetDataType
<
VarType
>
(
var
);
if
(
paddle
::
platform
::
is_gpu_place
(
place
)
||
paddle
::
platform
::
is_cuda_pinned_place
(
place
)
||
paddle
::
platform
::
is_xpu_place
(
place
))
{
paddle
::
platform
::
is_xpu_place
(
place
)
||
paddle
::
platform
::
is_npu_place
(
place
)
||
paddle
::
platform
::
is_npu_pinned_place
(
place
))
{
// CudaPinnedPlace is added for varbase created by dataloader
if
(
data_type
==
paddle
::
framework
::
proto
::
VarType
::
FP32
||
data_type
==
paddle
::
framework
::
proto
::
VarType
::
FP16
||
...
...
paddle/fluid/pybind/op_function_generator.h
浏览文件 @
69dd43d1
...
...
@@ -88,6 +88,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{
"nce"
,
{
"Input"
,
"Label"
,
"Weight"
,
"Bias"
,
"SampleWeight"
,
"CustomDistProbs"
,
"CustomDistAlias"
,
"CustomDistAliasProbs"
}},
{
"check_finite_and_unscale"
,
{
"X"
,
"Scale"
,
"FloatStatus"
}},
};
// NOTE(zhiqiu): Like op_ins_map.
...
...
python/paddle/fluid/dygraph/amp/auto_cast.py
浏览文件 @
69dd43d1
...
...
@@ -271,14 +271,19 @@ def amp_guard(enable=True,
"current_tracer is None, maybe it is not in imperative mode."
)
# check device_type:
# NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16.
# NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, npu for float16.
# Maybe we will support cpu for bfloat16.
if
enable
and
not
(
tracer
.
_expected_place
.
is_gpu_place
()
or
tracer
.
_expected_place
.
is_xpu_place
()):
tracer
.
_expected_place
.
is_xpu_place
()
or
tracer
.
_expected_place
.
is_npu_place
()):
warnings
.
warn
(
'amp_guard can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.'
'amp_guard can only be enabled on CUDAPlace, XPUPlace, and NPUPlace, current place is %s, so it makes no effect.'
%
tracer
.
_expected_place
)
enable
=
False
# For npu:
if
tracer
.
_expected_place
.
is_npu_place
()
and
(
dtype
==
'bfloat16'
):
warnings
.
warn
(
'NPUPlace only support float16 amp.'
)
enable
=
False
# For xpu:
if
tracer
.
_expected_place
.
is_xpu_place
()
and
(
dtype
==
'bfloat16'
):
warnings
.
warn
(
'XPUPlace only support float16 amp.'
)
...
...
python/paddle/fluid/dygraph/amp/loss_scaler.py
浏览文件 @
69dd43d1
...
...
@@ -105,9 +105,10 @@ class AmpScaler(object):
"current_tracer is None, maybe it is not in imperative mode."
)
if
enable
and
not
(
tracer
.
_expected_place
.
is_gpu_place
()
or
tracer
.
_expected_place
.
is_xpu_place
()):
tracer
.
_expected_place
.
is_xpu_place
()
or
tracer
.
_expected_place
.
is_npu_place
()):
warnings
.
warn
(
'AmpScaler can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.'
'AmpScaler can only be enabled on CUDAPlace, XPUPlace and NPUPlace, current place is %s, so it makes no effect.'
%
tracer
.
_expected_place
)
enable
=
False
...
...
@@ -286,14 +287,28 @@ class AmpScaler(object):
)
and
(
param
.
_grad_ivar
().
dtype
==
core
.
VarDesc
.
VarType
.
FP32
)
]
if
len
(
param_grads_fp16
):
_C_ops
.
check_finite_and_unscale
(
param_grads_fp16
,
self
.
_scale
,
param_grads_fp16
,
self
.
_temp_found_inf_fp16
)
if
len
(
param_grads_fp32
):
_C_ops
.
check_finite_and_unscale
(
param_grads_fp32
,
self
.
_scale
,
param_grads_fp32
,
self
.
_temp_found_inf_fp32
)
if
core
.
is_compiled_with_npu
():
float_status
=
_C_ops
.
alloc_float_status
()
_C_ops
.
clear_float_status
(
float_status
,
float_status
)
if
len
(
param_grads_fp16
):
_C_ops
.
check_finite_and_unscale
(
param_grads_fp16
,
self
.
_scale
,
float_status
,
param_grads_fp16
,
self
.
_temp_found_inf_fp16
)
if
len
(
param_grads_fp32
):
_C_ops
.
check_finite_and_unscale
(
param_grads_fp32
,
self
.
_scale
,
float_status
,
param_grads_fp32
,
self
.
_temp_found_inf_fp32
)
else
:
if
len
(
param_grads_fp16
):
_C_ops
.
check_finite_and_unscale
(
param_grads_fp16
,
self
.
_scale
,
param_grads_fp16
,
self
.
_temp_found_inf_fp16
)
if
len
(
param_grads_fp32
):
_C_ops
.
check_finite_and_unscale
(
param_grads_fp32
,
self
.
_scale
,
param_grads_fp32
,
self
.
_temp_found_inf_fp32
)
if
len
(
param_grads_fp16
)
and
len
(
param_grads_fp32
):
self
.
_found_inf
=
self
.
_temp_found_inf_fp16
or
self
.
_temp_found_inf_fp32
elif
len
(
param_grads_fp16
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录