Support multi-output feature for elementwise (#38410)

* first commit * pass ctest of elementwise_div_grad

Support multi-output feature for elementwise (#38410)
* first commit * pass ctest of elementwise_div_grad
48f061fb · limingshu · GitHub · 85f5d264 · 48f061fb · 48f061fb
6 changed file
--- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
@@ -162,7 +162,8 @@ struct DimensionsTransform {
  }
 };
-template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
+template <ElementwiseType ET, typename InT, typename OutT, typename Functor,
+          int NumOuts = 1>
 void LaunchBroadcastElementwiseCudaKernel(
    const platform::CUDADeviceContext &ctx,
    const std::vector<const framework::Tensor *> &ins,
@@ -190,11 +191,12 @@ void LaunchBroadcastElementwiseCudaKernel(
  for (int i = 0; i < pt_outputs_tmp.size(); i++) {
    pt_outputs.push_back(pt_outputs_tmp[i].get());
  }
-  pten::LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT>(
+  pten::LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
      ctx, pt_inputs, &pt_outputs, axis, func);
 }
-template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
+template <ElementwiseType ET, typename InT, typename OutT, typename Functor,
+          int NumOuts = 1>
 void LaunchElementwiseCudaKernel(
    const platform::CUDADeviceContext &cuda_ctx,
    const std::vector<const framework::Tensor *> &ins,
@@ -222,8 +224,8 @@ void LaunchElementwiseCudaKernel(
  for (int i = 0; i < pt_outputs_tmp.size(); i++) {
    pt_outputs.push_back(pt_outputs_tmp[i].get());
  }
-  pten::LaunchElementwiseCudaKernel<ET, InT, OutT>(cuda_ctx, pt_inputs,
+  pten::LaunchElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
-                                                   &pt_outputs, axis, func);
+      cuda_ctx, pt_inputs, &pt_outputs, axis, func);
 }
 }  // namespace operators

--- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
@@ -38,7 +38,8 @@ namespace kps = paddle::operators::kernel_primitives;
 using ElementwiseType = pten::ElementwiseType;
-template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
+template <ElementwiseType ET, typename InT, typename OutT, typename Functor,
+          int NumOuts = 1>
 void LaunchSameDimsElementwiseCudaKernel(
    const platform::CUDADeviceContext &ctx,
    const std::vector<const framework::Tensor *> &ins,
@@ -66,8 +67,8 @@ void LaunchSameDimsElementwiseCudaKernel(
  for (int i = 0; i < pt_outputs_tmp.size(); i++) {
    pt_outputs.push_back(pt_outputs_tmp[i].get());
  }
-  pten::LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT>(ctx, pt_inputs,
+  pten::LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
-                                                           &pt_outputs, func);
+      ctx, pt_inputs, &pt_outputs, func);
 }
 }  // namespace operators

--- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h
+++ b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h
@@ -19,7 +19,11 @@ limitations under the License. */
 namespace pten {
-template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
+template <ElementwiseType ET,
+          typename InT,
+          typename OutT,
+          typename Functor,
+          int NumOuts = 1>
 void LaunchElementwiseCudaKernel(
    const paddle::platform::CUDADeviceContext &cuda_ctx,
    const std::vector<const DenseTensor *> &ins,
@@ -33,14 +37,14 @@ void LaunchElementwiseCudaKernel(
    dims_size.emplace_back(in->dims().size());
  }
  if (no_broadcast_flag) {
-    LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT>(
+    LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
        cuda_ctx, ins, outs, func);
  } else {
    axis = axis == -1
               ? *std::max_element(dims_size.begin(), dims_size.end()) -
                     *std::min_element(dims_size.begin(), dims_size.end())
               : axis;
-    LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT>(
+    LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
        cuda_ctx, ins, outs, axis, func);
  }
 }

--- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h
+++ b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h
@@ -208,7 +208,7 @@ __device__ void ElementwiseBroadcastKernelImpl(
    int block_offset,
    Functor func) {
  InT args[Arity][VecSize];
-  OutType<OutT, NumOuts> result[VecSize];
+  ConditionalT<OutT, NumOuts> result[VecSize];
 #pragma unroll
  for (int i = 0; i < Arity; i++) {
@@ -224,7 +224,7 @@ __device__ void ElementwiseBroadcastKernelImpl(
  constexpr bool kCallElementwiseAny =
      paddle::platform::FunctionTraits<Functor>::has_pointer_args;
  ElementwisePrimitiveCaller<InT,
-                             OutType<OutT, NumOuts>,
+                             ConditionalT<OutT, NumOuts>,
                             VecSize,
                             Functor,
                             Arity,
@@ -455,18 +455,17 @@ void LaunchBroadcastElementwiseCudaKernel(
                        "is %d, the arity of functor is %d.",
                        ins.size(),
                        kArity));
-  PADDLE_ENFORCE_EQ(kArity,
+  PADDLE_ENFORCE_LE(kArity,
-                    2,
+                    ElementwiseType::kTernary,
                    paddle::platform::errors::InvalidArgument(
-                        "Currently only broadcast of binary is supported and "
+                        "Currently only broadcast of ternary is supported "
-                        "verified, but received %d.",
+                        "and verified, but received %d.",
                        kArity));
-  PADDLE_ENFORCE_EQ(
+  PADDLE_ENFORCE_EQ(outs->size(),
-      outs->size(),
                    NumOuts,
                    paddle::platform::errors::InvalidArgument(
                        "Number of outputs shall equal to number of functions, "
-          "but number of outputs is %d, number of functions is %d.",
+                        "but number of outputs is %d, of functions is %d.",
                        outs->size(),
                        NumOuts));
  int in_vec_size = 4;

--- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h
+++ b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h
@@ -27,7 +27,7 @@ enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3, kAny = -1 };
 /* Packing scalar type T(float, int etc.) into Array<T, NumOuts> type
   for supporting multiple-output feature in elementwise system.*/
 template <class T, int Num>
-using OutType =
+using ConditionalT =
    typename std::conditional_t<Num == 1, T, paddle::framework::Array<T, Num>>;
 template <typename InT,
@@ -86,7 +86,7 @@ template <typename OutT, int VecSize, bool IsBoundary, int NumOuts>
 struct ElementwiseWriteDataCaller {
  __device__ __forceinline__ void operator()(
      paddle::framework::Array<OutT *, NumOuts> outs,
-      OutType<OutT, NumOuts> src[VecSize],
+      ConditionalT<OutT, NumOuts> src[VecSize],
      int block_offset,
      int num) {
    OutT dst[NumOuts][VecSize];

--- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h
+++ b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h
@@ -55,16 +55,17 @@ template <typename InT,
          typename OutT,
          typename Functor,
          int Arity,
+          int NumOuts,
          int VecSize,
          bool IsBoundary>
 __device__ void VectorizedElementwiseKernelImpl(
    const paddle::framework::Array<const InT *__restrict__, Arity> &in,
-    OutT *out,
+    paddle::framework::Array<OutT *, NumOuts> outs,
    int num,
    int data_offset,
    Functor func) {
  InT args[Arity][VecSize];
-  OutT result[VecSize];
+  ConditionalT<OutT, NumOuts> result[VecSize];
 #pragma unroll
  for (int i = 0; i < Arity; i++) {
@@ -73,36 +74,53 @@ __device__ void VectorizedElementwiseKernelImpl(
        args[i], in[i] + data_offset, num);
  }
-  const bool kCallElementwiseAny =
+  constexpr bool kCallElementwiseAny =
      paddle::platform::FunctionTraits<Functor>::has_pointer_args;
  ElementwisePrimitiveCaller<InT,
-                             OutT,
+                             ConditionalT<OutT, NumOuts>,
                             VecSize,
                             Functor,
                             Arity,
                             kCallElementwiseAny>()(func, args, result);
-  kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
-      out + data_offset, result, num);
+  ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, NumOuts>()(
+      outs, result, data_offset, num);
 }
-template <typename InT, typename OutT, typename Functor, int Arity, int VecSize>
+template <typename InT,
+          typename OutT,
+          typename Functor,
+          int Arity,
+          int NumOuts,
+          int VecSize>
 __global__ void VectorizedElementwiseKernel(
    paddle::framework::Array<const InT *__restrict__, Arity> ins,
-    OutT *out,
+    paddle::framework::Array<OutT *, NumOuts> outs,
    int size,
    int main_offset,
    Functor func) {
  int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
  int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
  for (; data_offset < main_offset; data_offset += stride) {
-    VectorizedElementwiseKernelImpl<InT, OutT, Functor, Arity, VecSize, false>(
+    VectorizedElementwiseKernelImpl<InT,
-        ins, out, VecSize * BLOCK_NUM_X, data_offset, func);
+                                    OutT,
+                                    Functor,
+                                    Arity,
+                                    NumOuts,
+                                    VecSize,
+                                    false>(
+        ins, outs, VecSize * BLOCK_NUM_X, data_offset, func);
  }
  int num = size - data_offset;
  if (num > 0) {
-    VectorizedElementwiseKernelImpl<InT, OutT, Functor, Arity, VecSize, true>(
+    VectorizedElementwiseKernelImpl<InT,
-        ins, out, num, data_offset, func);
+                                    OutT,
+                                    Functor,
+                                    Arity,
+                                    NumOuts,
+                                    VecSize,
+                                    true>(ins, outs, num, data_offset, func);
  }
 }
@@ -121,7 +139,12 @@ int GetVectorizedSizeForTensors(const std::vector<const DenseTensor *> &ins,
  return vec_size;
 }
-template <typename InT, typename OutT, typename Functor, int Arity, int VecSize>
+template <typename InT,
+          typename OutT,
+          typename Functor,
+          int Arity,
+          int NumOuts,
+          int VecSize>
 void ElementwiseCudaKernel(const paddle::platform::CUDADeviceContext &ctx,
                           const std::vector<const DenseTensor *> &ins,
                           std::vector<DenseTensor *> *outs,
@@ -131,11 +154,15 @@ void ElementwiseCudaKernel(const paddle::platform::CUDADeviceContext &ctx,
  int grid_size =
      ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size;
  auto stream = ctx.stream();
-  OutT *out_data = (*outs)[0]->mutable_data<OutT>();
  paddle::framework::Array<const InT *__restrict__, Arity> ins_data;
-  for (int i = 0; i < Arity; i++) {
+  paddle::framework::Array<OutT *, NumOuts> outs_data;
+  for (int i = 0; i < Arity; ++i) {
    ins_data[i] = ins[i]->data<InT>();
  }
+  for (int i = 0; i < NumOuts; ++i) {
+    outs_data[i] = (*outs)[i]->mutable_data<OutT>();
+  }
 #ifdef PADDLE_WITH_XPU2
  block_size = 128;
  grid_size = 8;
@@ -144,20 +171,26 @@ void ElementwiseCudaKernel(const paddle::platform::CUDADeviceContext &ctx,
                              OutT,
                              Functor,
                              Arity,
+                              NumOuts,
                              VecSize><<<grid_size, block_size, 0, stream>>>(
-      ins_data, out_data, numel, main_offset, func);
+      ins_data, outs_data, numel, main_offset, func);
 #else
  int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size;
  VectorizedElementwiseKernel<InT,
                              OutT,
                              Functor,
                              Arity,
+                              NumOuts,
                              VecSize><<<grid_size, block_size, 0, stream>>>(
-      ins_data, out_data, numel, main_offset, func);
+      ins_data, outs_data, numel, main_offset, func);
 #endif
 }
-template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
+template <ElementwiseType ET,
+          typename InT,
+          typename OutT,
+          typename Functor,
+          int NumOuts = 1>
 void LaunchSameDimsElementwiseCudaKernel(
    const paddle::platform::CUDADeviceContext &ctx,
    const std::vector<const DenseTensor *> &ins,
@@ -174,19 +207,39 @@ void LaunchSameDimsElementwiseCudaKernel(
                        "is %d, the arity of functor is %d.",
                        ins.size(),
                        kArity));
+  PADDLE_ENFORCE_EQ(outs->size(),
+                    NumOuts,
+                    paddle::platform::errors::InvalidArgument(
+                        "Number of outputs shall equal to number of functions, "
+                        "but number of outputs is %d, of functions is %d.",
+                        outs->size(),
+                        NumOuts));
+  if (NumOuts > 1) {
+    for (int i = 1; i < NumOuts; ++i) {
+      PADDLE_ENFORCE_EQ(
+          (*outs)[i]->dims(),
+          (*outs)[0]->dims(),
+          paddle::platform::errors::InvalidArgument(
+              "The shape of each output tensor shall be identical yet, "
+              "but %dth output tensor`s shape is not.",
+              i));
+    }
+  }
  // calculate the max vec_size for all ins and outs
  int vec_size = GetVectorizedSizeForTensors<InT, OutT>(ins, *outs);
  switch (vec_size) {
    case 4:
-      ElementwiseCudaKernel<InT, OutT, Functor, kArity, 4>(
+      ElementwiseCudaKernel<InT, OutT, Functor, kArity, NumOuts, 4>(
          ctx, ins, outs, func);
      break;
    case 2:
-      ElementwiseCudaKernel<InT, OutT, Functor, kArity, 2>(
+      ElementwiseCudaKernel<InT, OutT, Functor, kArity, NumOuts, 2>(
          ctx, ins, outs, func);
      break;
    case 1:
-      ElementwiseCudaKernel<InT, OutT, Functor, kArity, 1>(
+      ElementwiseCudaKernel<InT, OutT, Functor, kArity, NumOuts, 1>(
          ctx, ins, outs, func);
      break;
    default: {