From 9056cc8b12faa4beb037dab1646ac2dc71428292 Mon Sep 17 00:00:00 2001
From: RuohengMa <120699764+RuohengMa@users.noreply.github.com>
Date: Wed, 18 Jan 2023 16:56:21 +0800
Subject: [PATCH] [PHI] remove bitwise and, or, xor (#49916)

* add reduce_sum_int64 and reduce_sum_int8 xpu kernels

* [PHI] add clip grad kernel with support type float32 and int32

* [PHI unittest] add clip_grad unit test

* adapt code to clang-format

* update xpu api output with clip_grad api

* remove int8 support of reduce_sum xpu kernel since it can not pass unit tests

* adapt license date, add code for XPUDataType convertion

* add int8 support of reduce_sum

* add reduce_sum unit tests for dtype int64, int8, and add more test cases

* update license date

* remove buggy bitwise and, or and xor xpu kernels, refine bitwise not xpu kernel

* change license date
---
 cmake/external/xpu.cmake                      |  2 +-
 paddle/phi/backends/xpu/xpu2_op_list.cc       | 12 ++---
 paddle/phi/kernels/clip_grad_kernel.h         |  2 +-
 paddle/phi/kernels/reduce_sum_kernel.cc       | 10 +++--
 paddle/phi/kernels/reduce_sum_kernel.h        |  2 +-
 paddle/phi/kernels/xpu/bitwise.cc             | 45 +++----------------
 paddle/phi/kernels/xpu/clip_grad_kernel.cc    | 44 ++++++++++++++++++
 paddle/phi/kernels/xpu/reduce_sum_kernel.cc   |  5 ++-
 .../tests/unittests/xpu/test_clip_op_xpu.py   | 16 +++++--
 .../unittests/xpu/test_reduce_sum_op_xpu.py   | 40 ++++++++++++++++-
 10 files changed, 121 insertions(+), 57 deletions(-)
 create mode 100644 paddle/phi/kernels/xpu/clip_grad_kernel.cc

diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index be088de898..f04e5f9d30 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
 if(NOT DEFINED XPU_BASE_URL)
   set(XPU_BASE_URL_WITHOUT_DATE
       "https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev")
-  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20230110")
+  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20230114")
 else()
   set(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index 367231972a..8451ee2774 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -67,10 +67,7 @@ XPUOpMap& get_kl2_ops() {
                     phi::DataType::INT64})},
       {"bilinear_interp_v2", XPUKernelSet({phi::DataType::FLOAT32})},
       {"bilinear_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})},
-      {"bitwise_and", XPUKernelSet({phi::DataType::BOOL})},
       {"bitwise_not", XPUKernelSet({phi::DataType::BOOL})},
-      {"bitwise_or", XPUKernelSet({phi::DataType::BOOL})},
-      {"bitwise_xor", XPUKernelSet({phi::DataType::BOOL})},
       {"broadcast", XPUKernelSet({phi::DataType::FLOAT32})},
       {"c_allgather",
        XPUKernelSet({phi::DataType::FLOAT16,
@@ -109,6 +106,8 @@ XPUOpMap& get_kl2_ops() {
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"clip", XPUKernelSet({phi::DataType::FLOAT32})},
       {"clip_by_norm", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"clip_grad",
+       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})},
       {"coalesce_tensor",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"concat_grad",
@@ -435,7 +434,10 @@ XPUOpMap& get_kl2_ops() {
       {"reduce_min", XPUKernelSet({phi::DataType::FLOAT32})},
       {"reduce_prod", XPUKernelSet({phi::DataType::FLOAT32})},
       {"reduce_sum_grad", XPUKernelSet({phi::DataType::FLOAT32})},
-      {"reduce_sum", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"reduce_sum",
+       XPUKernelSet({phi::DataType::FLOAT32,
+                     phi::DataType::INT8,
+                     phi::DataType::INT64})},
       {"relu6", XPUKernelSet({phi::DataType::FLOAT32})},
       {"relu6_grad", XPUKernelSet({phi::DataType::FLOAT32})},
       {"relu_grad",
diff --git a/paddle/phi/kernels/clip_grad_kernel.h b/paddle/phi/kernels/clip_grad_kernel.h
index 8a7e5b99fd..bc6245ce90 100644
--- a/paddle/phi/kernels/clip_grad_kernel.h
+++ b/paddle/phi/kernels/clip_grad_kernel.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/paddle/phi/kernels/reduce_sum_kernel.cc b/paddle/phi/kernels/reduce_sum_kernel.cc
index c6cfe42566..a3ff565fce 100644
--- a/paddle/phi/kernels/reduce_sum_kernel.cc
+++ b/paddle/phi/kernels/reduce_sum_kernel.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -27,7 +27,8 @@ void SumKernel(const Context& dev_ctx,
                bool keep_dim,
                DenseTensor* out) {
   bool reduce_all = recompute_reduce_all(x, dims);
-  SumRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out);
+  SumRawKernel<T>(
+      dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out);
 }
 
 }  // namespace phi
@@ -82,5 +83,8 @@
 #endif
 
 #if defined(PADDLE_WITH_XPU)
-PD_REGISTER_KERNEL(sum, XPU, ALL_LAYOUT, phi::SumKernel, float) {}
+PD_REGISTER_KERNEL(
+    sum, XPU, ALL_LAYOUT, phi::SumKernel, float, int8_t, int64_t) {
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
 #endif
diff --git a/paddle/phi/kernels/reduce_sum_kernel.h b/paddle/phi/kernels/reduce_sum_kernel.h
index 3bcf025d96..e994b073fc 100644
--- a/paddle/phi/kernels/reduce_sum_kernel.h
+++ b/paddle/phi/kernels/reduce_sum_kernel.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/paddle/phi/kernels/xpu/bitwise.cc b/paddle/phi/kernels/xpu/bitwise.cc
index a897a37acd..019acf52f8 100644
--- a/paddle/phi/kernels/xpu/bitwise.cc
+++ b/paddle/phi/kernels/xpu/bitwise.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -19,51 +19,18 @@
 namespace phi {
 
-template <typename T, typename Context>
-void BitwiseAndKernel(const Context& ctx,
-                      const DenseTensor& x,
-                      const DenseTensor& y,
-                      DenseTensor* out) {
-  ctx.template Alloc<T>(out);
-  int r = xpu::logical_and(
-      ctx.x_context(), x.data<T>(), y.data<T>(), out->data<T>(), x.numel());
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "bitwise and");
-}
-
-template <typename T, typename Context>
-void BitwiseOrKernel(const Context& ctx,
-                     const DenseTensor& x,
-                     const DenseTensor& y,
-                     DenseTensor* out) {
-  ctx.template Alloc<T>(out);
-  int r = xpu::logical_or(
-      ctx.x_context(), x.data<T>(), y.data<T>(), out->data<T>(), x.numel());
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "bitwise or");
-}
-
-template <typename T, typename Context>
-void BitwiseXorKernel(const Context& ctx,
-                      const DenseTensor& x,
-                      const DenseTensor& y,
-                      DenseTensor* out) {
-  ctx.template Alloc<T>(out);
-  int r = xpu::logical_xor(
-      ctx.x_context(), x.data<T>(), y.data<T>(), out->data<T>(), x.numel());
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "bitwise xor");
-}
-
 template <typename T, typename Context>
 void BitwiseNotKernel(const Context& ctx,
                       const DenseTensor& x,
                       DenseTensor* out) {
+  using XPUDataType = typename XPUTypeTrait<T>::Type;
   ctx.template Alloc<T>(out);
-  int r =
-      xpu::logical_not(ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
+  int r = xpu::logical_not(ctx.x_context(),
+                           reinterpret_cast<const XPUDataType*>(x.data<T>()),
+                           reinterpret_cast<XPUDataType*>(out->data<T>()),
+                           x.numel());
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "bitwise not");
 }
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(bitwise_and, XPU, ALL_LAYOUT, phi::BitwiseAndKernel, bool) {}
-PD_REGISTER_KERNEL(bitwise_or, XPU, ALL_LAYOUT, phi::BitwiseOrKernel, bool) {}
-PD_REGISTER_KERNEL(bitwise_xor, XPU, ALL_LAYOUT, phi::BitwiseXorKernel, bool) {}
 PD_REGISTER_KERNEL(bitwise_not, XPU, ALL_LAYOUT, phi::BitwiseNotKernel, bool) {}
diff --git a/paddle/phi/kernels/xpu/clip_grad_kernel.cc b/paddle/phi/kernels/xpu/clip_grad_kernel.cc
new file mode 100644
index 0000000000..ff1cc21660
--- /dev/null
+++ b/paddle/phi/kernels/xpu/clip_grad_kernel.cc
@@ -0,0 +1,44 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/clip_grad_kernel.h"
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ClipGradKernel(const Context& ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    const Scalar& min,
+                    const Scalar& max,
+                    DenseTensor* x_grad) {
+  ctx.template Alloc<T>(x_grad);
+  using XPUDataType = typename XPUTypeTrait<T>::Type;
+  int r =
+      xpu::clip_grad(ctx.x_context(),
+                     reinterpret_cast<const XPUDataType*>(x.data<T>()),
+                     reinterpret_cast<const XPUDataType*>(out_grad.data<T>()),
+                     reinterpret_cast<XPUDataType*>(x_grad->data<T>()),
+                     x.numel(),
+                     min.to<T>(),
+                     max.to<T>());
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_grad");
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    clip_grad, XPU, ALL_LAYOUT, phi::ClipGradKernel, float, int) {}
diff --git a/paddle/phi/kernels/xpu/reduce_sum_kernel.cc b/paddle/phi/kernels/xpu/reduce_sum_kernel.cc
index ac13dc3de3..dd3abc7bad 100644
--- a/paddle/phi/kernels/xpu/reduce_sum_kernel.cc
+++ b/paddle/phi/kernels/xpu/reduce_sum_kernel.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -46,4 +46,5 @@ void SumRawKernel(const Context& dev_ctx,
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(sum_raw, XPU, ALL_LAYOUT, phi::SumRawKernel, float) {}
+PD_REGISTER_KERNEL(
+    sum_raw, XPU, ALL_LAYOUT, phi::SumRawKernel, float, int8_t, int64_t) {}
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py
index 075ff7f7e8..4bf88d40b7 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@ from xpu.get_test_cover_info import (
 
 import paddle
 import paddle.fluid as fluid
-from paddle.fluid import Program, program_guard
+from paddle.fluid import Program, core, program_guard
 
 
 class XPUTestClipOp(XPUOpTestWrapper):
@@ -51,7 +51,7 @@ class XPUTestClipOp(XPUOpTestWrapper):
 
         def set_xpu(self):
             self.__class__.use_xpu = True
-            self.__class__.no_need_check_grad = True
+            self.__class__.no_need_check_grad = False
             self.__class__.op_type = self.dtype
 
         def init_data(self):
@@ -91,6 +91,16 @@ class XPUTestClipOp(XPUOpTestWrapper):
                 self.check_output_with_place(self.place)
                 paddle.disable_static()
 
+        def test_check_grad(self):
+            if hasattr(self, "no_need_check_grad") and self.no_need_check_grad:
+                return
+            if core.is_compiled_with_xpu():
+                paddle.enable_static()
+                self.check_grad_with_place(
+                    self.place, ['X'], 'Out', check_eager=True
+                )
+                paddle.disable_static()
+
     class TestClipOp1(TestClipOp):
         def init_data(self):
             self.shape = (8, 16, 8)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py
index d8a1e9efcb..2ffc6c2d22 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -48,7 +48,7 @@ class XPUTestReduceSumOp(XPUOpTestWrapper):
                 'reduce_all': self.reduce_all,
                 'keep_dim': self.keep_dim,
             }
-            self.inputs = {'X': np.random.random(self.shape).astype("float32")}
+            self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)}
             if self.attrs['reduce_all']:
                 self.outputs = {'Out': self.inputs['X'].sum()}
             else:
@@ -63,6 +63,7 @@
             self.axis = (0,)
             self.reduce_all = False
             self.keep_dim = False
+            self.dtype = self.in_type
 
         def test_check_output(self):
             self.check_output_with_place(self.place)
@@ -71,12 +72,47 @@
             self.check_grad_with_place(self.place, ['X'], 'Out')
 
     class XPUTestReduceSumCase1(XPUTestReduceSumBase):
+        def init_case(self):
+            self.shape = (5, 6, 10)
+            self.axis = (0,)
+            self.reduce_all = False
+            self.keep_dim = False
+
+    class XPUTestReduceSumCase2(XPUTestReduceSumBase):
         def init_case(self):
             self.shape = (5, 6, 10)
             self.axis = (0,)
             self.reduce_all = False
             self.keep_dim = True
 
+    class XPUTestReduceSumCase3(XPUTestReduceSumBase):
+        def init_case(self):
+            self.shape = (5, 6, 10)
+            self.axis = (0,)
+            self.reduce_all = True
+            self.keep_dim = False
+
+    class XPUTestReduceSumCase4(XPUTestReduceSumBase):
+        def init_case(self):
+            self.shape = (5, 6, 10)
+            self.axis = (1,)
+            self.reduce_all = False
+            self.keep_dim = False
+
+    class XPUTestReduceSumCase5(XPUTestReduceSumBase):
+        def init_case(self):
+            self.shape = (5, 6, 10)
+            self.axis = (1,)
+            self.reduce_all = False
+            self.keep_dim = True
+
+    class XPUTestReduceSumCase6(XPUTestReduceSumBase):
+        def init_case(self):
+            self.shape = (5, 6, 10)
+            self.axis = (1,)
+            self.reduce_all = True
+            self.keep_dim = False
+
 
 support_types = get_xpu_op_support_types('reduce_sum')
 for stype in support_types:
-- 
GitLab
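
A note on the `XPUTypeTrait` pattern the patch introduces in `BitwiseNotKernel` and `ClipGradKernel`: the trait maps the framework element type `T` to the element type the XDNN device API expects, and the `reinterpret_cast` is then a pure pointer-view change between layout-compatible types. The standalone sketch below illustrates the same trait-plus-cast idiom; `FrameworkHalf`, `DeviceHalf`, `TypeTrait`, and `device_bitwise_not` are simplified hypothetical stand-ins for illustration, not Paddle or XDNN APIs.

```cpp
#include <cstdint>
#include <iostream>

// Hypothetical stand-ins: the framework type and the device API type are
// distinct C++ types but share the same bit layout (cf. paddle's float16
// vs. the XDNN half type behind XPUTypeTrait<T>::Type in the patch).
struct FrameworkHalf { uint16_t bits; };
struct DeviceHalf { uint16_t bits; };

// Trait mapping a framework element type to the device API element type.
template <typename T>
struct TypeTrait {
  using Type = T;  // identity for plain types (float, int, bool, ...)
};
template <>
struct TypeTrait<FrameworkHalf> {
  using Type = DeviceHalf;  // the one case that actually remaps
};

// A device-API-style routine that only understands DeviceHalf.
void device_bitwise_not(const DeviceHalf* in, DeviceHalf* out, int64_t n) {
  for (int64_t i = 0; i < n; ++i) out[i].bits = ~in[i].bits;
}

// Kernel template in the style of the patched BitwiseNotKernel: buffers are
// typed as T, the device call is made in terms of TypeTrait<T>::Type. The
// cast is safe only because the two types are layout-compatible.
template <typename T>
void NotKernel(const T* x, T* y, int64_t n) {
  using XPUDataType = typename TypeTrait<T>::Type;
  device_bitwise_not(reinterpret_cast<const XPUDataType*>(x),
                     reinterpret_cast<XPUDataType*>(y), n);
}

int main() {
  FrameworkHalf x[2] = {{0x0000}, {0x00FF}};
  FrameworkHalf y[2];
  NotKernel(x, y, 2);
  std::cout << std::hex << y[0].bits << " " << y[1].bits << "\n";  // ffff ff00
}
```

The payoff of this design is visible in the registration lines of the patch: one template body serves every registered dtype (`float` and `int` for `clip_grad`, `bool` for `bitwise_not`), with the trait absorbing any mismatch between framework and device element types.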