Unverified commit 62bff0e0, authored by Leo Guo, committed by GitHub

Add data types int and int64 for the add kernel. Modify the code style of the instance_norm_grad kernel. Fix a bug where the input data type differs from the output data type in the reduce_sum kernel. test=kunlun (#50443)

Parent: f7267412
......@@ -120,6 +120,7 @@ XPUOpMap& get_kl2_ops() {
phi::DataType::FLOAT16,
phi::DataType::FLOAT64,
phi::DataType::BOOL,
phi::DataType::INT8,
phi::DataType::UINT8,
phi::DataType::INT64,
phi::DataType::INT32})},
......@@ -286,6 +287,7 @@ XPUOpMap& get_kl2_ops() {
XPUKernelSet({phi::DataType::INT64,
phi::DataType::INT32,
phi::DataType::INT16,
phi::DataType::INT8,
phi::DataType::UINT8,
phi::DataType::BOOL,
phi::DataType::FLOAT32,
......
......@@ -335,4 +335,20 @@ namespace phi {
} \
}()
#define PD_VISIT_XPU_REDUCE_TYPES(TYPE, NAME, ...) \
[&] { \
const auto& __dtype__ = TYPE; \
switch (__dtype__) { \
PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT8, int8_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT32, int32_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT64, int64_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::phi::DataType::FLOAT16, phi::float16, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::FLOAT32, float, __VA_ARGS__) \
default: \
PADDLE_THROW(phi::errors::InvalidArgument( \
"Invalid enum data type `%d`.", static_cast<int>(__dtype__))); \
} \
}()
} // namespace phi
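For orientation, the new visitor macro follows the existing PD_VISIT_* pattern in this header: each PD_PRIVATE_CASE_TYPE branch exposes the selected C++ type as data_t inside the lambda passed through __VA_ARGS__. A minimal usage sketch (a hypothetical call site, not part of this diff):

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/visit_type.h"

// Sketch: dispatch a runtime dtype to code templated on the concrete type.
// Within the visitor body, `data_t` is the C++ type matching t.dtype()
// (one of int8_t, int32_t, int64_t, float16, float).
void VisitReduceType(const phi::DenseTensor& t) {
  PD_VISIT_XPU_REDUCE_TYPES(t.dtype(), "VisitReduceType", ([&] {
    const data_t* ptr = t.data<data_t>();  // typed access to the tensor
    (void)ptr;  // a real caller would hand `ptr` to a kernel templated on data_t
  }));
}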
......@@ -88,6 +88,7 @@ PD_REGISTER_KERNEL(full,
phi::FullKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
......
......@@ -304,9 +304,14 @@ PD_REGISTER_KERNEL(divide,
PD_REGISTER_KERNEL(
divide, XPU, ALL_LAYOUT, phi::DivideKernel, phi::dtype::float16, float) {}
PD_REGISTER_KERNEL(
add, XPU, ALL_LAYOUT, phi::AddKernel, phi::dtype::float16, float, int64_t) {
}
PD_REGISTER_KERNEL(add,
XPU,
ALL_LAYOUT,
phi::AddKernel,
phi::dtype::float16,
float,
int,
int64_t) {}
PD_REGISTER_KERNEL(multiply,
XPU,
......
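The widened registration above is what makes integer elementwise adds dispatchable on XPU. A rough sketch of the instantiations it enables (hypothetical driver code; the header path and the exact AddKernel signature are assumptions based on phi's elementwise kernels, not shown in this diff):

#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/elementwise_add_kernel.h"  // assumed location of phi::AddKernel

// Sketch only: with int and int64_t now registered, these typed instantiations
// become valid dispatch targets for the "add" op on XPU.
void AddIntegersOnXPU(const phi::XPUContext& dev_ctx,
                      const phi::DenseTensor& x,
                      const phi::DenseTensor& y,
                      phi::DenseTensor* out) {
  phi::AddKernel<int, phi::XPUContext>(dev_ctx, x, y, out);         // int32 add
  // phi::AddKernel<int64_t, phi::XPUContext>(dev_ctx, x, y, out);  // int64 add
}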
......@@ -72,6 +72,13 @@ void CastKernel(const Context& dev_ctx,
dev_ctx.template Alloc<bool>(out),
numel);
break;
case phi::DataType::INT8:
r = xpu::cast<XPUInTDType, int8_t>(
dev_ctx.x_context(),
reinterpret_cast<const XPUInTDType*>(in_data),
dev_ctx.template Alloc<int8_t>(out),
numel);
break;
case phi::DataType::UINT8:
r = xpu::cast<XPUInTDType, uint8_t>(
dev_ctx.x_context(),
......@@ -104,6 +111,7 @@ PD_REGISTER_KERNEL(cast,
phi::dtype::float16,
int64_t,
bool,
int8_t,
uint8_t,
double) {
kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED);
......
......@@ -119,6 +119,7 @@ PD_REGISTER_KERNEL(full,
ALL_LAYOUT,
phi::FullKernel,
float,
int8_t,
uint8_t,
int16_t,
int,
......
......@@ -77,27 +77,45 @@ void InstanceNormGradKernel(const Context& dev_ctx,
scale_ptr->dims()));
}
DenseTensor scale_tmp;
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
float* scale_ptr_data_tmp;
int r;
if (!scale_ptr) {
scale_tmp.Resize({C});
dev_ctx.template Alloc<T>(&scale_tmp);
scale_ptr_data_tmp = RAII_GUARD.alloc_l3_or_gm<float>(C);
r = xpu::constant(dev_ctx.x_context(),
reinterpret_cast<XPUType*>(scale_tmp.data<T>()),
scale_tmp.numel(),
static_cast<XPUType>(1));
reinterpret_cast<float*>(scale_ptr_data_tmp),
C,
static_cast<float>(1));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
}
auto scale_ptr_tmp = scale_ptr ? scale_ptr : &scale_tmp;
auto scale_ptr_data =
scale_ptr ? scale_ptr->data<float>() : scale_ptr_data_tmp;
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
if ((H * W * D) == 1) {
r = xpu::copy(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(d_y.data<T>()),
reinterpret_cast<XPUType*>(d_x->data<T>()),
d_y.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
r = xpu::constant(dev_ctx.x_context(),
reinterpret_cast<float*>(d_scale),
C,
static_cast<float>(0));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
r = xpu::constant(dev_ctx.x_context(),
reinterpret_cast<float*>(d_bias),
C,
static_cast<float>(0));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
return;
}
auto d_x_data =
d_x ? d_x->data<T>() : RAII_GUARD.alloc_l3_or_gm<T>(x.numel());
r = xpu::instance_norm_grad(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<const XPUType*>(d_y.data<T>()),
reinterpret_cast<XPUType*>(d_x_data),
scale_ptr_tmp->data<float>(),
scale_ptr_data,
saved_mean.data<float>(),
saved_variance.data<float>(),
d_scale_data,
......
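The rewritten section above replaces the temporary DenseTensor of ones with scratch memory owned by xpu::ctx_guard, and adds an early-return path for the degenerate H * W * D == 1 case. The scratch-buffer idiom in isolation (a sketch distilled from the code above; the include set is an assumption, and a Context plus a channel count C are taken as given):

#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"

// Sketch: the ctx_guard releases the L3/GM allocation when it leaves scope,
// so no temporary DenseTensor is needed just to hold a vector of C ones.
void FillOnesScratch(const phi::XPUContext& dev_ctx, int64_t C) {
  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
  float* ones = RAII_GUARD.alloc_l3_or_gm<float>(C);
  int r = xpu::constant(dev_ctx.x_context(), ones, C, 1.0f);
  PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
}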
......@@ -19,6 +19,10 @@
#include <string>
#include <vector>
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/phi/kernels/xpu/reduce_util.h"
namespace phi {
template <typename Context, typename T>
......@@ -82,4 +86,89 @@ int XPUReduce(const Context& dev_ctx,
return r;
}
template <typename DeviceContext, typename T, typename OutT, typename Functor>
void ReduceKernelImpl(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* output,
const std::vector<int>& xdims,
const std::vector<int>& reduce_dims) {
using XPUType = typename XPUTypeTrait<OutT>::Type;
dev_ctx.template Alloc<OutT>(output);
const auto* x_data = input.data<OutT>();
auto* y_data = output->data<OutT>();
if (reduce_dims.size() == 0) {
int r = xpu::copy<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x_data),
reinterpret_cast<XPUType*>(y_data),
input.numel() * sizeof(T));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
} else {
Functor func;
func(dev_ctx.x_context(), x_data, y_data, xdims, reduce_dims);
}
}
template <typename DeviceContext, typename T, typename Functor>
void XPUReduce(const DeviceContext& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DataType out_dtype,
DenseTensor* out) {
reduce_all = recompute_reduce_all(x, dims, reduce_all);
const auto& input_dim_size = x.dims().size();
std::vector<int> true_dims;
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) {
true_dims.push_back(dims[i] + input_dim_size);
} else {
true_dims.push_back(dims[i]);
}
}
std::vector<int> reduce_dims;
std::vector<int> xdims((input_dim_size));
for (int i = 0; i < input_dim_size; ++i) {
xdims[i] = x.dims()[i];
}
if (reduce_all) {
for (int i = 0; i < input_dim_size; ++i) {
reduce_dims.push_back(i);
}
} else {
std::set<int> dims_set(true_dims.begin(), true_dims.end());
for (auto i = 0; i < input_dim_size; i++) {
if (dims_set.find(i) != dims_set.end()) {
if (x.dims()[i] != 1) {
reduce_dims.push_back(i);
}
}
}
}
// no need to cast dtype
if (out_dtype == phi::DataType::UNDEFINED || out_dtype == x.dtype()) {
// do reduce sum
PD_VISIT_XPU_REDUCE_TYPES(
x.dtype(), "ReduceKernelImpl", ([&] {
phi::ReduceKernelImpl<DeviceContext, T, data_t, Functor>(
dev_ctx, x, out, xdims, reduce_dims);
}));
} else {
// cast x tensor to out_dtype
auto tmp_tensor = phi::Cast<T, DeviceContext>(dev_ctx, x, out_dtype);
// do reduce sum
PD_VISIT_XPU_REDUCE_TYPES(
out_dtype, "ReduceKernelImpl", ([&] {
phi::ReduceKernelImpl<DeviceContext, T, data_t, Functor>(
dev_ctx, tmp_tensor, out, xdims, reduce_dims);
}));
if (dev_ctx.x_context()->xpu_stream) {
dev_ctx.Wait();
}
}
}
} // namespace phi
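To make the index bookkeeping in the new XPUReduce overload concrete, here is a small worked trace (values chosen for illustration, not taken from the diff):

// x.dims() == {2, 3, 4}, dims == {-1}, reduce_all == false:
//   input_dim_size = 3
//   true_dims   -> {2}         (negative axis folded by adding the rank)
//   xdims       -> {2, 3, 4}
//   reduce_dims -> {2}         (kept because x.dims()[2] != 1)
// With out_dtype UNDEFINED (or equal to x.dtype()) the reduction runs directly
// in x.dtype(); otherwise x is cast to out_dtype first and the visitor
// instantiates ReduceKernelImpl for that output type.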
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/reduce_sum_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/xpu/reduce.h"
......@@ -29,23 +28,11 @@ void SumRawKernel(const Context& dev_ctx,
bool reduce_all,
DataType out_dtype,
DenseTensor* out) {
reduce_all = recompute_reduce_all(x, dims, reduce_all);
using XPUType = typename XPUTypeTrait<T>::Type;
auto f = [](xpu::Context* ctx,
const T* x,
T* y,
const std::vector<int>& xdims,
const std::vector<int>& reduce_dims) {
return xpu::reduce_sum<XPUType>(ctx,
reinterpret_cast<const XPUType*>(x),
reinterpret_cast<XPUType*>(y),
xdims,
reduce_dims);
};
int r = XPUReduce<Context, T>(
dev_ctx, x, dims.GetData(), keep_dim, reduce_all, out, f);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
if (out_dtype == DataType::UNDEFINED && out->dtype() != x.dtype()) {
out_dtype = out->dtype();
}
XPUReduce<Context, T, phi::SumFunctor>(
dev_ctx, x, dims.GetData(), keep_dim, reduce_all, out_dtype, out);
}
} // namespace phi
......
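The out_dtype handling added above matters when the framework has already promoted the output's dtype while still passing out_dtype as UNDEFINED; previously the kernel reduced in the input type regardless. A hedged illustration of that scenario (the wrapper and its name are assumptions; the SumRawKernel signature is taken from the code above):

#include <vector>
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/reduce_sum_kernel.h"

// Sketch: `out` arrives with its meta dtype already set to INT64 (as the sum
// infer-meta typically does for int32 inputs) while out_dtype is UNDEFINED.
// With the fix, SumRawKernel picks up out->dtype() and XPUReduce casts x to
// int64 before reducing, so input and output dtypes no longer disagree.
void SumInt32ToInt64(const phi::XPUContext& dev_ctx,
                     const phi::DenseTensor& x_int32,
                     phi::DenseTensor* out_int64) {
  phi::SumRawKernel<int, phi::XPUContext>(dev_ctx,
                                          x_int32,
                                          phi::IntArray(std::vector<int64_t>{0}),
                                          /*keep_dim=*/false,
                                          /*reduce_all=*/false,
                                          phi::DataType::UNDEFINED,
                                          out_int64);
}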
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
namespace phi {
//////// Sum Functor ///////
struct SumFunctor {
template <typename DeviceContext, typename X, typename Y>
void operator()(const DeviceContext& ctx,
const X* x,
Y* y,
const std::vector<int>& xdims,
const std::vector<int>& reduce_dims) {
using XPUType = typename XPUTypeTrait<X>::Type;
int r = xpu::reduce_sum<XPUType>(ctx,
reinterpret_cast<const XPUType*>(x),
reinterpret_cast<XPUType*>(y),
xdims,
reduce_dims);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
}
};
} // namespace phi
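SumFunctor is the only functor in this new header, but the abstraction generalizes: any xdnn reduction that takes (ctx, x, y, xdims, reduce_dims) can be wrapped the same way and handed to XPUReduce through its Functor template parameter, exactly as SumRawKernel does above. An illustrative analogue, not part of this commit and assuming xdnn exposes reduce_max with the same signature as reduce_sum:

// Hypothetical MaxFunctor, shown only to illustrate how the Functor slot of
// XPUReduce generalizes beyond reduce_sum.
struct MaxFunctor {
  template <typename DeviceContext, typename X, typename Y>
  void operator()(const DeviceContext& ctx,
                  const X* x,
                  Y* y,
                  const std::vector<int>& xdims,
                  const std::vector<int>& reduce_dims) {
    using XPUType = typename XPUTypeTrait<X>::Type;
    int r = xpu::reduce_max<XPUType>(ctx,
                                     reinterpret_cast<const XPUType*>(x),
                                     reinterpret_cast<XPUType*>(y),
                                     xdims,
                                     reduce_dims);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_max");
  }
};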
......@@ -35,6 +35,7 @@ typeid_dict = {
'float32': int(core.VarDesc.VarType.FP32),
'float16': int(core.VarDesc.VarType.FP16),
'bool': int(core.VarDesc.VarType.BOOL),
'int8': int(core.VarDesc.VarType.INT8),
'uint8': int(core.VarDesc.VarType.UINT8),
'float64': int(core.VarDesc.VarType.FP64),
}
......@@ -53,6 +54,7 @@ class XPUTestCastOp(XPUOpTestWrapper):
'float32',
'int32',
'int64',
'int8',
'uint8',
'bool',
'float64',
......