From 3419de531fe1bf9941540a22e6e088253ba08c59 Mon Sep 17 00:00:00 2001
From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com>
Date: Mon, 10 May 2021 20:58:59 +0800
Subject: [PATCH] Support different data type between input and output
 (#32823)

---
 paddle/fluid/operators/abs_op.cu              |  97 +++++++++++------
 paddle/fluid/operators/activation_op.cu       |  16 +--
 .../elementwise/elementwise_add_op.cu         |   2 +-
 .../elementwise/elementwise_op_impl.cu.h      | 100 ++++++++++--------
 4 files changed, 129 insertions(+), 86 deletions(-)

diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu
index e373d628f6..97409e6cb1 100644
--- a/paddle/fluid/operators/abs_op.cu
+++ b/paddle/fluid/operators/abs_op.cu
@@ -13,44 +13,79 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/abs_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/float16.h"
 
+namespace paddle {
+namespace operators {
+
+template <typename T, typename Enable = void>
+struct CudaAbsFunctor;
+
+template <typename T>
+struct CudaAbsFunctor<T, math::Complex<T, math::Real<T>>> {
+  __device__ __forceinline__ math::Real<T> operator()(const T* args) const {
+    return abs(args[0]);
+  }
+};
+
+template <typename T>
+struct CudaAbsFunctor<T, math::NoComplex<T, math::Real<T>>> {
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return std::abs(args[0]);
+  }
+};
+
+template <typename T>
+class AbsKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+    out->mutable_data<math::Real<T>>(context.GetPlace());
+
+    auto& dev_ctx =
+        context.template device_context<platform::CUDADeviceContext>();
+    std::vector<const framework::Tensor*> ins = {x};
+    std::vector<framework::Tensor*> outs = {out};
+    auto functor = CudaAbsFunctor<T>();
+    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, math::Real<T>>(
+        dev_ctx, ins, &outs, functor);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
 REGISTER_OP_CUDA_KERNEL(
-    abs, ops::AbsKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext,
-                   paddle::platform::float16>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext,
-                   paddle::platform::complex64>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext,
-                   paddle::platform::complex128>);
+    abs, ops::AbsKernel<plat::CUDADeviceContext, float>,
+    ops::AbsKernel<plat::CUDADeviceContext, double>,
+    ops::AbsKernel<plat::CUDADeviceContext, int>,
+    ops::AbsKernel<plat::CUDADeviceContext, int64_t>,
+    ops::AbsKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::AbsKernel<plat::CUDADeviceContext, plat::complex64>,
+    ops::AbsKernel<plat::CUDADeviceContext, plat::complex128>);
 REGISTER_OP_CUDA_KERNEL(
-    abs_grad, ops::AbsGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext,
-                       paddle::platform::float16>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext,
-                       paddle::platform::complex64>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext,
-                       paddle::platform::complex128>);
+    abs_grad, ops::AbsGradKernel<plat::CUDADeviceContext, float>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, double>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, int>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, plat::complex64>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, plat::complex128>);
 REGISTER_OP_CUDA_KERNEL(
-    abs_grad_grad,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                             paddle::platform::float16>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                             paddle::platform::complex64>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                             paddle::platform::complex128>);
+    abs_grad_grad, ops::AbsDoubleGradKernel<plat::CUDADeviceContext, float>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, double>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, int>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, plat::complex64>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, plat::complex128>);
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 22f8147111..618f17031b 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -1315,8 +1315,8 @@ class ActivationCudaKernel
     for (auto& attr : attrs) {
       *attr.second = ctx.Attr<float>(attr.first);
     }
-    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T>(dev_ctx, ins, &outs,
-                                                            functor);
+    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(dev_ctx, ins,
+                                                               &outs, functor);
   }
 };
 
@@ -1345,17 +1345,17 @@ class ActivationGradCudaKernel
     if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
       // Only need forward output Out
       ins.push_back(out);
-      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(dev_ctx, ins,
-                                                               &outs, functor);
+      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+          dev_ctx, ins, &outs, functor);
     } else if (static_cast<int>(Functor::FwdDeps()) ==
                static_cast<int>(kDepX)) {
       // Only need forward input X
       ins.push_back(x);
-      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(dev_ctx, ins,
-                                                               &outs, functor);
+      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+          dev_ctx, ins, &outs, functor);
     } else {
-      LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T>(dev_ctx, ins,
-                                                              &outs, functor);
+      LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+          dev_ctx, ins, &outs, functor);
     }
   }
 };
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
index 5c444e752e..dc9c18ba03 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -45,7 +45,7 @@ struct SameDimsElemwiseAdd<
                   framework::Tensor* z) {
     std::vector<const framework::Tensor*> ins = {x, y};
     std::vector<framework::Tensor*> outs = {z};
-    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(
+    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
         ctx.template device_context<platform::CUDADeviceContext>(), ins, &outs,
         CudaAddFunctor<T>());
   }
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
index 321826ec64..38b1afbdc3 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
@@ -49,69 +49,73 @@ int GetVectorizedSizeImpl(const T *pointer) {
   return 1;
 }
 
-template <typename T>
+template <typename InT, typename OutT>
 int GetVectorizedSize(const std::vector<const framework::Tensor *> &ins,
                       const std::vector<framework::Tensor *> &outs) {
   int vec_size = 4;
   for (auto iter = ins.begin(); iter != ins.end(); ++iter) {
     vec_size =
-        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<T>()));
+        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<InT>()));
   }
   for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
     vec_size =
-        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<T>()));
+        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<OutT>()));
   }
   return vec_size;
 }
 
-template <ElementwiseType ET, int VecSize, typename T>
+template <ElementwiseType ET, int VecSize, typename InT, typename OutT>
 struct ElementwiseDataWrapper {
-  T *out;
-  const T *in0;
-  const T *in1;
-  __device__ ElementwiseDataWrapper(T *out, const T *in0,
-                                    const T *in1 = nullptr)
+  OutT *out;
+  const InT *in0;
+  const InT *in1;
+  __device__ ElementwiseDataWrapper(OutT *out, const InT *in0,
+                                    const InT *in1 = nullptr)
       : out(out), in0(in0), in1(in1) {}
 
-  using VecType = CudaAlignedVector<T, VecSize>;
+  using InVecType = CudaAlignedVector<InT, VecSize>;
+  using OutVecType = CudaAlignedVector<OutT, VecSize>;
 
-  inline __device__ void load_vector(VecType args[], int idx) {
-    const VecType *x_vec = reinterpret_cast<const VecType *>(in0);
+  inline __device__ void load_vector(InVecType args[], int idx) {
+    const InVecType *x_vec = reinterpret_cast<const InVecType *>(in0);
     args[0] = x_vec[idx];
     if (ET == ElementwiseType::kBinary) {
-      const VecType *y_vec = reinterpret_cast<const VecType *>(in1);
+      const InVecType *y_vec = reinterpret_cast<const InVecType *>(in1);
       args[1] = y_vec[idx];
     }
   }
 
-  inline __device__ void load_scalar(T args[], int idx) {
+  inline __device__ void load_scalar(InT args[], int idx) {
     args[0] = in0[idx];
     if (ET == ElementwiseType::kBinary) {
      args[1] = in1[idx];
    }
  }
 
-  inline __device__ void store_vector(VecType res, int idx) {
-    VecType *out_vec = reinterpret_cast<VecType *>(out);
+  inline __device__ void store_vector(OutVecType res, int idx) {
+    OutVecType *out_vec = reinterpret_cast<OutVecType *>(out);
     out_vec[idx] = res;
   }
 
-  inline __device__ void store_scalar(T res, int idx) { out[idx] = res; }
+  inline __device__ void store_scalar(OutT res, int idx) { out[idx] = res; }
 };
 
-template <ElementwiseType ET, int VecSize, typename T, typename Functor>
+template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
+          typename Functor>
 __device__ void VectorizedKernelImpl(
-    ElementwiseDataWrapper<ET, VecSize, T> data, Functor func, int tid) {
-  using VecType = CudaAlignedVector<T, VecSize>;
-  VecType ins_vec[ET];
-  VecType out_vec;
-  T *ins_ptr[ET];
-  T *out_ptr;
+    ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
+    int tid) {
+  using InVecType = CudaAlignedVector<InT, VecSize>;
+  using OutVecType = CudaAlignedVector<OutT, VecSize>;
+  InVecType ins_vec[ET];
+  OutVecType out_vec;
+  InT *ins_ptr[ET];
+  OutT *out_ptr;
 #pragma unroll
   for (int i = 0; i < ET; ++i) {
-    ins_ptr[i] = reinterpret_cast<T *>(&(ins_vec[i]));
+    ins_ptr[i] = reinterpret_cast<InT *>(&(ins_vec[i]));
   }
-  out_ptr = reinterpret_cast<T *>(&out_vec);
+  out_ptr = reinterpret_cast<OutT *>(&out_vec);
 
   // load
   data.load_vector(ins_vec, tid);
@@ -119,7 +123,7 @@ __device__ void VectorizedKernelImpl(
   // compute
 #pragma unroll
   for (int i = 0; i < VecSize; ++i) {
-    T ins[ET];
+    InT ins[ET];
 #pragma unroll
     for (int j = 0; j < ET; ++j) {
       ins[j] = ins_ptr[j][i];
@@ -131,11 +135,13 @@ __device__ void VectorizedKernelImpl(
   data.store_vector(out_vec, tid);
 }
 
-template <ElementwiseType ET, int VecSize, typename T, typename Functor>
-__device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, VecSize, T> data,
-                                 Functor func, int start, int remain) {
-  T ins[ET];
-  T out;
+template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
+          typename Functor>
+__device__ void ScalarKernelImpl(
+    ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
+    int start, int remain) {
+  InT ins[ET];
+  OutT out;
 
   for (int i = 0; i < remain; ++i) {
     int idx = start + i;
@@ -148,14 +154,15 @@ __device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, VecSize, T> data,
   }
 }
 
-template <ElementwiseType ET, int VecSize, typename T, typename Functor>
-__global__ void VectorizedKernel(const T *__restrict__ in0,
-                                 const T *__restrict__ in1, T *out, int size,
-                                 Functor func) {
+template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
+          typename Functor>
+__global__ void VectorizedKernel(const InT *__restrict__ in0,
+                                 const InT *__restrict__ in1, OutT *out,
+                                 int size, Functor func) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int remain = size - VecSize * tid;
   remain = remain > 0 ? remain : 0;
-  auto data = ElementwiseDataWrapper<ET, VecSize, T>(out, in0, in1);
+  auto data = ElementwiseDataWrapper<ET, VecSize, InT, OutT>(out, in0, in1);
   if (remain >= VecSize) {
     VectorizedKernelImpl(data, func, tid);
   } else {
@@ -163,30 +170,31 @@ __global__ void VectorizedKernel(const T *__restrict__ in0,
   }
 }
 
-template <ElementwiseType ET, typename T, typename Functor>
-__global__ void ScalarKernel(const T *__restrict__ in0,
-                             const T *__restrict__ in1, T *out, int size,
+template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
+__global__ void ScalarKernel(const InT *__restrict__ in0,
+                             const InT *__restrict__ in1, OutT *out, int size,
                              Functor func) {
-  auto data = ElementwiseDataWrapper<ET, 1, T>(out, in0, in1);
+  auto data = ElementwiseDataWrapper<ET, 1, InT, OutT>(out, in0, in1);
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int remain = tid < size ? 1 : 0;
   ScalarKernelImpl(data, func, tid, remain);
 }
 
-template <ElementwiseType ET, typename T, typename Functor>
+template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
 void LaunchElementwiseCudaKernel(
     const platform::CUDADeviceContext &ctx,
     const std::vector<const framework::Tensor *> &ins,
     std::vector<framework::Tensor *> *outs, Functor func) {
   // calculate the max vec_size for all ins and outs
   auto size = ins[0]->numel();
-  int vec_size = GetVectorizedSize<T>(ins, *outs);
+  int vec_size = GetVectorizedSize<InT, OutT>(ins, *outs);
   int block_size = ELEMENTWISE_BLOCK_SIZE;
   int grid_size =
       ((size + vec_size - 1) / vec_size + block_size - 1) / block_size;
-  const T *in0 = ins[0]->data<T>();
-  const T *in1 = (ET == ElementwiseType::kBinary) ? ins[1]->data<T>() : nullptr;
-  T *out = (*outs)[0]->data<T>();
+  const InT *in0 = ins[0]->data<InT>();
+  const InT *in1 =
+      (ET == ElementwiseType::kBinary) ? ins[1]->data<InT>() : nullptr;
+  OutT *out = (*outs)[0]->data<OutT>();
   // cuda kernel
   auto stream = ctx.stream();
   switch (vec_size) {
--
GitLab
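
Editor's note (not part of the upstream patch): the core of this change is that the element-wise launcher is now templated on separate input and output scalar types (InT/OutT), so a functor's result type no longer has to match its argument type; abs is the motivating case, since the absolute value of a complex64 tensor is a float tensor. Below is a minimal standalone CUDA sketch of the same InT/OutT split, written against plain CUDA types rather than Paddle's Tensor and LaunchElementwiseCudaKernel machinery. AbsFunctor and ElementwiseUnaryKernel are illustrative names invented for this sketch, not Paddle APIs.

// Editor's sketch (illustrative only): an element-wise kernel templated on
// separate input/output types, mirroring the InT/OutT split introduced above.
#include <cstdio>

#include <cuComplex.h>
#include <cuda_runtime.h>

// A functor whose result type differs from its argument type, analogous to
// CudaAbsFunctor on complex input: cuFloatComplex in, float out.
struct AbsFunctor {
  __device__ float operator()(cuFloatComplex x) const { return cuCabsf(x); }
};

// InT and OutT are deduced independently, so func may change the data type.
template <typename InT, typename OutT, typename Functor>
__global__ void ElementwiseUnaryKernel(const InT *in, OutT *out, int size,
                                       Functor func) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < size) {
    out[tid] = func(in[tid]);
  }
}

int main() {
  const int n = 4;
  cuFloatComplex h_in[n];
  float h_out[n] = {0};
  for (int i = 0; i < n; ++i) h_in[i] = make_cuFloatComplex(3.0f, 4.0f);

  cuFloatComplex *d_in = nullptr;
  float *d_out = nullptr;
  cudaMalloc(&d_in, n * sizeof(cuFloatComplex));
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(cuFloatComplex), cudaMemcpyHostToDevice);

  // InT = cuFloatComplex, OutT = float: input and output types differ,
  // just as in the abs kernel registered for complex64/complex128 above.
  ElementwiseUnaryKernel<<<1, n>>>(d_in, d_out, n, AbsFunctor{});
  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);

  for (int i = 0; i < n; ++i) printf("%f\n", h_out[i]);  // |3+4i| = 5
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

Compiled with nvcc and run, this prints 5.000000 four times. The template parameters are deduced independently from the pointer arguments, which is exactly the freedom the new three-parameter LaunchElementwiseCudaKernel<ET, InT, OutT> signature gives Paddle's abs kernel (T in, math::Real<T> out).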