未验证 提交 6197fbf6 编写于 作者: E enzodechine 提交者: GitHub

Re-implemented check_finite_and_unscale_op with newly added xdnn api (#42960)

* Re-implemented check_finite_and_unscale_op with newly added xdnn api
* test=kunlun
Parent commit: b07f469b
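For context before the kernel diff: check_finite_and_unscale is the AMP helper that scans a list of scaled gradients for Inf/NaN and, when all values are finite, divides them by the loss scale. Below is a minimal NumPy sketch of that contract as exercised by the tests later in this commit; it is an illustration of the op's semantics, not the Paddle implementation.

```python
import numpy as np

def check_finite_and_unscale_ref(xs, scale):
    """Reference semantics (NumPy sketch, not the Paddle kernel):
    FoundInfinite is 1 if any input holds Inf/NaN; otherwise every
    input is unscaled by 1/scale."""
    found_infinite = any(not np.isfinite(x).all() for x in xs)
    if found_infinite:
        # Outputs are not meaningful in this case and are discarded by
        # the caller (the tests below skip checking 'Out').
        return [x.copy() for x in xs], np.array([1])
    inverse_scale = 1.0 / scale
    return [x * inverse_scale for x in xs], np.array([0])

# Example usage with illustrative data:
# outs, found = check_finite_and_unscale_ref(
#     [np.random.random((8, 8)).astype(np.float32)],
#     np.array([0.5], dtype=np.float32))
```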
@@ -15,9 +15,12 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace operators {
 template <typename T>
 class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
   using MPDType = typename details::MPTypeTrait<T>::Type;
@@ -38,6 +41,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
     // cpy to cpu
     bool cpu_found_inf_data = false;
+    // number of inf and nans
+    int nums_inf_nans = 0;
     MPDType cpu_scale_data;
     if (platform::is_xpu_place(scale->place())) {
       memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_scale_data),
@@ -52,48 +57,21 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
       const auto* x = xs[i];
       auto* out = outs[i];
       out->mutable_data<T>(dev_ctx.GetPlace());
-      framework::Tensor is_finite =
-          ctx.AllocateTmpTensor<bool, platform::XPUDeviceContext>(x->dims(),
-                                                                  dev_ctx);
-      framework::Tensor is_nan =
-          ctx.AllocateTmpTensor<bool, platform::XPUDeviceContext>(x->dims(),
-                                                                  dev_ctx);
-      framework::Tensor is_finite_and_nan =
-          ctx.AllocateTmpTensor<bool, platform::XPUDeviceContext>(x->dims(),
-                                                                  dev_ctx);
-      if (cpu_found_inf_data == false) {
-        int r = xpu::isfinite(dev_ctx.x_context(),
-                              reinterpret_cast<const XPUTyp*>(x->data<T>()),
-                              is_finite.data<bool>(), x->numel());
-        PADDLE_ENFORCE_EQ(
-            r, XPU_SUCCESS,
-            platform::errors::External("XPU API(isfinite) return wrong "
-                                       "value[%d %s]",
-                                       r, XPUAPIErrorMsg[r]));
-        r = xpu::logical_not(
-            dev_ctx.x_context(),
-            reinterpret_cast<const bool*>(is_finite.data<bool>()),
-            is_finite.data<bool>(), x->numel());
-        PADDLE_ENFORCE_EQ(
-            r, XPU_SUCCESS,
-            platform::errors::External("XPU API(logical_not) return wrong "
-                                       "value[%d %s]",
-                                       r, XPUAPIErrorMsg[r]));
-        r = xpu::any(dev_ctx.x_context(), is_finite.data<bool>(),
-                     found_inf_data, x->numel());
-        PADDLE_ENFORCE_EQ(
-            r, XPU_SUCCESS,
-            platform::errors::External("XPU API(any) return wrong "
-                                       "value[%d %s]",
-                                       r, XPUAPIErrorMsg[r]));
-        if (dev_ctx.x_context()->xpu_stream) {
-          dev_ctx.Wait();
-        }
-        memory::Copy(platform::CPUPlace(), &cpu_found_inf_data,
-                     dev_ctx.GetPlace(), found_inf_data, sizeof(bool));
+      framework::Tensor inf_nan_count =
+          ctx.AllocateTmpTensor<int, platform::XPUDeviceContext>(
+              found_inf->dims(), dev_ctx);
+
+      if (nums_inf_nans == 0) {
+        int r = xpu::count_nan_or_inf(
+            dev_ctx.x_context(), reinterpret_cast<const XPUTyp*>(x->data<T>()),
+            inf_nan_count.data<int>(), x->numel());
+        PADDLE_ENFORCE_XDNN_SUCCESS(r, "count_nan_or_inf");
+        memory::Copy(platform::CPUPlace(), &nums_inf_nans, dev_ctx.GetPlace(),
+                     inf_nan_count.data<int>(), sizeof(int));
       }
-      if (cpu_found_inf_data) {
+
+      if (nums_inf_nans > 0) {
+        cpu_found_inf_data = true;
         inverse_scale = 0.0;
       }
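The hunk above replaces the old isfinite → logical_not → any chain with a single xpu::count_nan_or_inf call per tensor and skips further device work once a non-finite value has been found. A hedged Python sketch of that control flow (the function name is illustrative, not a Paddle API):

```python
import numpy as np

def detect_non_finite(xs):
    """Mirror of the loop's early-exit logic: count Inf/NaN per tensor,
    but stop issuing checks as soon as any tensor reports a non-zero count."""
    nums_inf_nans = 0
    for x in xs:
        if nums_inf_nans == 0:
            # Stands in for the device-side xpu::count_nan_or_inf call.
            nums_inf_nans = int(np.count_nonzero(~np.isfinite(x)))
    return nums_inf_nans > 0
```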
@@ -109,45 +87,25 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
         int r = xpu::cast_v2(dev_ctx.x_context(),
                              reinterpret_cast<const float16*>(x->data<T>()),
                              float_x.data<MPDType>(), x->numel());
-        PADDLE_ENFORCE_EQ(
-            r, XPU_SUCCESS,
-            platform::errors::External("XPU API(cast_v2) return wrong "
-                                       "value[%d %s]",
-                                       r, XPUAPIErrorMsg[r]));
+        PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
         r = xpu::scale(dev_ctx.x_context(), float_x.data<MPDType>(),
                        float_out.data<MPDType>(), x->numel(), false,
                        inverse_scale, 0.0);
-        PADDLE_ENFORCE_EQ(
-            r, XPU_SUCCESS,
-            platform::errors::External("XPU API(scale) return wrong "
-                                       "value[%d %s]",
-                                       r, XPUAPIErrorMsg[r]));
+        PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
         r = xpu::cast_v2(dev_ctx.x_context(), float_out.data<MPDType>(),
                          reinterpret_cast<float16*>(out->data<T>()),
                          out->numel());
-        PADDLE_ENFORCE_EQ(
-            r, XPU_SUCCESS,
-            platform::errors::External("XPU API(cast_v2) return wrong "
-                                       "value[%d %s]",
-                                       r, XPUAPIErrorMsg[r]));
+        PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
       } else {
         int r = xpu::scale(dev_ctx.x_context(),
                            reinterpret_cast<const XPUTyp*>(x->data<T>()),
                            reinterpret_cast<XPUTyp*>(out->data<T>()),
                            x->numel(), false, inverse_scale, 0.0);
-        PADDLE_ENFORCE_EQ(
-            r, XPU_SUCCESS,
-            platform::errors::External("XPU API(scale) return wrong "
-                                       "value[%d %s]",
-                                       r, XPUAPIErrorMsg[r]));
+        PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
       }
     }
-    if (dev_ctx.x_context()->xpu_stream) {
-      dev_ctx.Wait();
-    }
     memory::Copy(dev_ctx.GetPlace(), found_inf_data, platform::CPUPlace(),
                  &cpu_found_inf_data, sizeof(bool));
   }
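In the float16 branch kept above, the unscale runs on a float32 intermediate: cast x up, multiply by 1/loss_scale, cast back down. A rough NumPy equivalent for illustration only (the kernel itself uses xpu::cast_v2 and xpu::scale):

```python
import numpy as np

def unscale_fp16(x_fp16, inverse_scale):
    """Unscale a float16 tensor via a float32 intermediate, mirroring
    the cast_v2 -> scale -> cast_v2 path of the kernel's fp16 branch."""
    x_fp32 = x_fp16.astype(np.float32)   # cast_v2 (fp16 -> fp32)
    y_fp32 = x_fp32 * inverse_scale      # scale with bias 0.0
    return y_fp32.astype(np.float16)     # cast_v2 (fp32 -> fp16)

# e.g. unscale_fp16(np.array([1.0, 2.0], dtype=np.float16), 0.5)
# -> array([0.5, 1.0], dtype=float16)
```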
......
@@ -19,84 +19,126 @@ import paddle
 import unittest
 import numpy as np
 from op_test_xpu import XPUOpTest
-from op_test import OpTest, skip_check_grad_ci
-import paddle.fluid as fluid
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
 
 paddle.enable_static()
 
-class TestCheckFiniteAndUnscaleOp(XPUOpTest):
-    def setUp(self):
-        self.op_type = "check_finite_and_unscale"
-        self.init_dtype()
-        x = np.random.random((1024, 1024)).astype(self.dtype)
-        scale = np.random.random((1)).astype(self.dtype)
-        # self.attrs = {'stop_gradient': True}
-        self.inputs = {'X': [('x0', x)], 'Scale': scale}
-        self.outputs = {
-            'FoundInfinite': np.array([0]),
-            'Out': [('out0', x / scale)],
-        }
-
-    def init_dtype(self):
-        self.dtype = np.float32
-
-    def test_check_output(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
-
-# class TestCheckFiniteAndUnscaleOpWithNan(XPUOpTest):
-#     def setUp(self):
-#         self.op_type = "check_finite_and_unscale"
-#         self.init_dtype()
-#         x = np.random.random((1024, 1024)).astype(self.dtype)
-#         x[128][128] = np.nan
-#         print("x shape = ", x.shape)
-#         print(x)
-#         scale = np.random.random((1)).astype(self.dtype)
-#
-#         self.inputs = {'X': [('x0', x)], 'Scale': scale}
-#         self.outputs = {
-#             'FoundInfinite': np.array([1]),
-#             'Out': [('out0', x)],
-#         }
-#
-#     def init_dtype(self):
-#         self.dtype = np.float32
-#
-#     def test_check_output(self):
-#         # When input contains nan, do not check the output,
-#         # since the output may be nondeterministic and will be discarded.
-#         if paddle.is_compiled_with_xpu():
-#             place = paddle.XPUPlace(0)
-#             self.check_output_with_place(place, no_check_set=['Out'])
-
-# class TestCheckFiniteAndUnscaleOpWithInf(XPUOpTest):
-#     def setUp(self):
-#         self.op_type = "check_finite_and_unscale"
-#         self.init_dtype()
-#         x = np.random.random((1024, 1024)).astype(self.dtype)
-#         x[128][128] = np.inf
-#         scale = np.random.random((1)).astype(self.dtype)
-#         self.inputs = {'X': [('x0', x)], 'Scale': scale}
-#         self.outputs = {
-#             'FoundInfinite': np.array([1]),
-#             'Out': [('out0', x)],
-#         }
-#
-#     def init_dtype(self):
-#         self.dtype = np.float32
-#
-#     def test_check_output(self):
-#         # When input contains inf, do not check the output,
-#         # since the output may be nondeterministic and will be discarded.
-#         if paddle.is_compiled_with_xpu():
-#             place = paddle.XPUPlace(0)
-#             self.check_output_with_place(place, no_check_set=['Out'])
+
+class XPUTestCheckFiniteAndUnscaleOp(XPUOpTestWrapper):
+
+    def __init__(self):
+        self.op_name = 'check_finite_and_unscale'
+        self.use_dynamic_create_class = False
+
+    class TestCheckFiniteAndUnscaleOpNormal(XPUOpTest):
+
+        def setUp(self):
+            self.op_type = "check_finite_and_unscale"
+            self.init_dtype()
+            x = np.random.random((8, 8)).astype(self.dtype)
+            scale = np.random.random((1)).astype(np.float32)
+            self.inputs = {'X': [('x0', x)], 'Scale': scale}
+            self.outputs = {
+                'FoundInfinite': np.array([0]),
+                'Out': [('out0', x / scale)],
+            }
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def test_check_output(self):
+            if paddle.is_compiled_with_xpu():
+                place = paddle.XPUPlace(0)
+                self.check_output_with_place(place)
+
+    class TestCheckFiniteAndUnscaleOpWithNan(XPUOpTest):
+
+        def setUp(self):
+            self.op_type = "check_finite_and_unscale"
+            self.init_dtype()
+            x = np.random.random((256, 256)).astype(self.dtype)
+            idx1 = np.random.randint(255)
+            idx2 = np.random.randint(255)
+            x[idx1][idx2] = np.nan
+            x[idx2][idx1] = np.nan
+            scale = np.random.random((1)).astype(np.float32)
+            self.inputs = {'X': [('x0', x)], 'Scale': scale}
+            self.outputs = {
+                'FoundInfinite': np.array([1]),
+                'Out': [('out0', x)],
+            }
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def test_check_output(self):
+            # When input contains nan, do not check the output,
+            # since the output may be nondeterministic and will be discarded.
+            if paddle.is_compiled_with_xpu():
+                place = paddle.XPUPlace(0)
+                self.check_output_with_place(place, no_check_set=['Out'])
+
+    class TestCheckFiniteAndUnscaleOpWithInf(XPUOpTest):
+
+        def setUp(self):
+            self.op_type = "check_finite_and_unscale"
+            self.init_dtype()
+            x = np.random.random((256, 256)).astype(self.dtype)
+            idx1 = np.random.randint(255)
+            idx2 = np.random.randint(255)
+            x[idx1][idx2] = np.nan
+            x[idx2][idx1] = np.nan
+            scale = np.random.random((1)).astype(np.float32)
+            myscale = np.array([0.05]).astype(self.dtype)
+            self.inputs = {'X': [('x0', x)], 'Scale': scale}
+            self.outputs = {
+                'FoundInfinite': np.array([1]),
+                'Out': [('out0', x)],
+            }
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def test_check_output(self):
+            # When input contains inf, do not check the output,
+            # since the output may be nondeterministic and will be discarded.
+            if paddle.is_compiled_with_xpu():
+                place = paddle.XPUPlace(0)
+                self.check_output_with_place(place, no_check_set=['Out'])
+
+    class TestCheckFiniteAndUnscaleOpWithInfAndNan(XPUOpTest):
+
+        def setUp(self):
+            self.op_type = "check_finite_and_unscale"
+            self.init_dtype()
+            x = np.random.random((256, 256)).astype(self.dtype)
+            idx1 = np.random.randint(255)
+            idx2 = np.random.randint(255)
+            x[idx1][idx2] = np.inf
+            x[idx2][idx1] = np.nan
+            scale = np.random.random((1)).astype(np.float32)
+            myscale = np.array([0.05]).astype(self.dtype)
+            self.inputs = {'X': [('x0', x)], 'Scale': scale}
+            self.outputs = {
+                'FoundInfinite': np.array([1]),
+                'Out': [('out0', x)],
+            }
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def test_check_output(self):
+            # When input contains inf, do not check the output,
+            # since the output may be nondeterministic and will be discarded.
+            if paddle.is_compiled_with_xpu():
+                place = paddle.XPUPlace(0)
+                self.check_output_with_place(place, no_check_set=['Out'])
+
+
+support_types = get_xpu_op_support_types('check_finite_and_unscale')
+for stype in support_types:
+    create_test_class(globals(), XPUTestCheckFiniteAndUnscaleOp, stype)
 
 if __name__ == '__main__':
     unittest.main()
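The rewritten tests build their inputs the same way for every supported dtype registered through create_test_class: a random matrix, optionally poisoned with NaN/Inf at random positions, with FoundInfinite derived from whether the input stays finite. A condensed sketch of that construction (the helper name is hypothetical and not part of the test file):

```python
import numpy as np

def make_case(dtype, inject=None, shape=(256, 256), seed=0):
    """Build (inputs, expected FoundInfinite) the way the tests above do.
    `inject` is None, np.nan, or np.inf."""
    rng = np.random.RandomState(seed)
    x = rng.random_sample(shape).astype(dtype)
    if inject is not None:
        idx1, idx2 = rng.randint(shape[0] - 1), rng.randint(shape[1] - 1)
        x[idx1][idx2] = inject
    scale = rng.random_sample((1,)).astype(np.float32)
    found_infinite = np.array([0 if np.isfinite(x).all() else 1])
    return {'X': [('x0', x)], 'Scale': scale}, found_infinite
```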