Unverified commit 3419de53, authored by Zhang Zheng, committed by GitHub

Support different data type between input and output (#32823)

Parent fbbc3394
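
This commit generalizes the vectorized elementwise CUDA launch path so that the input and output element types can differ, which abs needs for complex tensors: complex64/complex128 in, float/double out. As a standalone illustration of that input/output type split (plain CUDA, not Paddle's launch path; every name in the snippet is invented for the example):

// Standalone sketch: an elementwise kernel whose output element type (float)
// differs from its input element type (cuFloatComplex), mirroring what the
// new InT/OutT template parameters in this commit make possible inside Paddle.
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <cstdio>

__global__ void ComplexAbsKernel(const cuFloatComplex* in, float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = cuCabsf(in[i]);  // complex in, real out
}

int main() {
  const int n = 2;
  cuFloatComplex h_in[n] = {make_cuFloatComplex(3.f, 4.f),
                            make_cuFloatComplex(0.f, -1.f)};
  float h_out[n] = {0.f, 0.f};
  cuFloatComplex* d_in = nullptr;
  float* d_out = nullptr;
  cudaMalloc(&d_in, n * sizeof(cuFloatComplex));
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(cuFloatComplex), cudaMemcpyHostToDevice);
  ComplexAbsKernel<<<1, 32>>>(d_in, d_out, n);
  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("%f %f\n", h_out[0], h_out[1]);  // expected: 5.000000 1.000000
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
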
@@ -13,44 +13,79 @@
// limitations under the License.
#include "paddle/fluid/operators/abs_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h"
+ namespace paddle {
+ namespace operators {
+ template <typename T, typename Enable = void>
+ struct CudaAbsFunctor;
+ template <typename T>
+ struct CudaAbsFunctor<T, math::Complex<T, math::Real<T>>> {
+ __device__ __forceinline__ math::Real<T> operator()(const T* args) const {
+ return abs(args[0]);
+ }
+ };
+ template <typename T>
+ struct CudaAbsFunctor<T, math::NoComplex<T, math::Real<T>>> {
+ __device__ __forceinline__ T operator()(const T* args) const {
+ return std::abs(args[0]);
+ }
+ };
+ template <typename T>
+ class AbsKernel<platform::CUDADeviceContext, T>
+ : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext& context) const override {
+ const Tensor* x = context.Input<Tensor>("X");
+ Tensor* out = context.Output<Tensor>("Out");
+ out->mutable_data<math::Real<T>>(context.GetPlace());
+ auto& dev_ctx =
+ context.template device_context<platform::CUDADeviceContext>();
+ std::vector<const framework::Tensor*> ins = {x};
+ std::vector<framework::Tensor*> outs = {out};
+ auto functor = CudaAbsFunctor<T>();
+ LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, math::Real<T>>(
+ dev_ctx, ins, &outs, functor);
+ }
+ };
+ } // namespace operators
+ } // namespace paddle
namespace ops = paddle::operators;
+ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
- abs, ops::AbsKernel<paddle::platform::CUDADeviceContext, float>,
- ops::AbsKernel<paddle::platform::CUDADeviceContext, double>,
- ops::AbsKernel<paddle::platform::CUDADeviceContext, int>,
- ops::AbsKernel<paddle::platform::CUDADeviceContext, int64_t>,
- ops::AbsKernel<paddle::platform::CUDADeviceContext,
- paddle::platform::float16>,
- ops::AbsKernel<paddle::platform::CUDADeviceContext,
- paddle::platform::complex64>,
- ops::AbsKernel<paddle::platform::CUDADeviceContext,
- paddle::platform::complex128>);
+ abs, ops::AbsKernel<plat::CUDADeviceContext, float>,
+ ops::AbsKernel<plat::CUDADeviceContext, double>,
+ ops::AbsKernel<plat::CUDADeviceContext, int>,
+ ops::AbsKernel<plat::CUDADeviceContext, int64_t>,
+ ops::AbsKernel<plat::CUDADeviceContext, plat::float16>,
+ ops::AbsKernel<plat::CUDADeviceContext, plat::complex64>,
+ ops::AbsKernel<plat::CUDADeviceContext, plat::complex128>);
REGISTER_OP_CUDA_KERNEL(
- abs_grad, ops::AbsGradKernel<paddle::platform::CUDADeviceContext, float>,
- ops::AbsGradKernel<paddle::platform::CUDADeviceContext, double>,
- ops::AbsGradKernel<paddle::platform::CUDADeviceContext, int>,
- ops::AbsGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
- ops::AbsGradKernel<paddle::platform::CUDADeviceContext,
- paddle::platform::float16>,
- ops::AbsGradKernel<paddle::platform::CUDADeviceContext,
- paddle::platform::complex64>,
- ops::AbsGradKernel<paddle::platform::CUDADeviceContext,
- paddle::platform::complex128>);
+ abs_grad, ops::AbsGradKernel<plat::CUDADeviceContext, float>,
+ ops::AbsGradKernel<plat::CUDADeviceContext, double>,
+ ops::AbsGradKernel<plat::CUDADeviceContext, int>,
+ ops::AbsGradKernel<plat::CUDADeviceContext, int64_t>,
+ ops::AbsGradKernel<plat::CUDADeviceContext, plat::float16>,
+ ops::AbsGradKernel<plat::CUDADeviceContext, plat::complex64>,
+ ops::AbsGradKernel<plat::CUDADeviceContext, plat::complex128>);
REGISTER_OP_CUDA_KERNEL(
- abs_grad_grad,
- ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, float>,
- ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, double>,
- ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, int>,
- ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
- ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext,
- paddle::platform::float16>,
- ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext,
- paddle::platform::complex64>,
- ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext,
- paddle::platform::complex128>);
+ abs_grad_grad, ops::AbsDoubleGradKernel<plat::CUDADeviceContext, float>,
+ ops::AbsDoubleGradKernel<plat::CUDADeviceContext, double>,
+ ops::AbsDoubleGradKernel<plat::CUDADeviceContext, int>,
+ ops::AbsDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
+ ops::AbsDoubleGradKernel<plat::CUDADeviceContext, plat::float16>,
+ ops::AbsDoubleGradKernel<plat::CUDADeviceContext, plat::complex64>,
+ ops::AbsDoubleGradKernel<plat::CUDADeviceContext, plat::complex128>);
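
A note on the functor dispatch above: since the primary template declares Enable = void, math::Complex<T, math::Real<T>> has to be an enable_if-style alias that is well-formed (as void) only when T is a complex type, with math::NoComplex as its mirror image, so exactly one CudaAbsFunctor specialization is viable for any T and the complex one returns the real-valued math::Real<T>. A minimal standalone sketch of that selection pattern, with hand-rolled stand-ins for the math:: helpers (assumed behavior, not Paddle's definitions):

// Stand-ins for math::Real / math::Complex / math::NoComplex (assumed behavior).
#include <cmath>
#include <complex>
#include <cstdio>
#include <type_traits>

template <typename T> struct RealOf { using type = T; };
template <typename R> struct RealOf<std::complex<R>> { using type = R; };
template <typename T> using Real = typename RealOf<T>::type;

// Well-formed (as void) only for complex T, respectively only for real T.
template <typename T>
using IfComplex = typename std::enable_if<!std::is_same<T, Real<T>>::value>::type;
template <typename T>
using IfNoComplex = typename std::enable_if<std::is_same<T, Real<T>>::value>::type;

template <typename T, typename Enable = void>
struct AbsFunctor;

template <typename T>
struct AbsFunctor<T, IfComplex<T>> {  // complex input, real output
  Real<T> operator()(const T* args) const { return std::abs(args[0]); }
};

template <typename T>
struct AbsFunctor<T, IfNoComplex<T>> {  // same type in and out
  T operator()(const T* args) const { return std::abs(args[0]); }
};

int main() {
  std::complex<float> c(3.f, 4.f);
  float r = -2.f;
  std::printf("%f %f\n", AbsFunctor<std::complex<float>>()(&c),  // 5.0
              AbsFunctor<float>()(&r));                          // 2.0
  return 0;
}
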
@@ -1315,8 +1315,8 @@ class ActivationCudaKernel
for (auto& attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
- LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T>(dev_ctx, ins, &outs,
- functor);
+ LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(dev_ctx, ins,
+ &outs, functor);
}
};
@@ -1345,17 +1345,17 @@ class ActivationGradCudaKernel
if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
// Only need forward output Out
ins.push_back(out);
- LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(dev_ctx, ins,
- &outs, functor);
+ LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+ dev_ctx, ins, &outs, functor);
} else if (static_cast<int>(Functor::FwdDeps()) ==
static_cast<int>(kDepX)) {
// Only need forward input X
ins.push_back(x);
- LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(dev_ctx, ins,
- &outs, functor);
+ LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+ dev_ctx, ins, &outs, functor);
} else {
- LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T>(dev_ctx, ins,
- &outs, functor);
+ LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+ dev_ctx, ins, &outs, functor);
}
}
};
......
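
In the gradient path above, ins presumably already holds the upstream gradient (set up just before the excerpt), and either Out or X is appended depending on Functor::FwdDeps(), so a binary grad functor reads its packed arguments positionally: args[0] is the incoming gradient and args[1] the forward value it asked for. An illustrative functor of that shape (a ReLU-style rule that only needs the forward output; not the functor Paddle actually registers):

// Illustrative only: a binary grad functor of the shape used above, assuming
// ins = {d_out, out} for a kDepOut functor. args[0] is the upstream gradient,
// args[1] the saved forward output.
template <typename T>
struct ReluLikeGradFunctor {
  __device__ __forceinline__ T operator()(const T* args) const {
    return args[1] > static_cast<T>(0) ? args[0] : static_cast<T>(0);
  }
};
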
@@ -45,7 +45,7 @@ struct SameDimsElemwiseAdd<platform::CUDADeviceContext, T> {
framework::Tensor* z) {
std::vector<const framework::Tensor*> ins = {x, y};
std::vector<framework::Tensor*> outs = {z};
- LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(
+ LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
ctx.template device_context<platform::CUDADeviceContext>(), ins, &outs,
CudaAddFunctor<T>());
}
......
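
SameDimsElemwiseAdd keeps a single element type, so it simply passes T for both InT and OutT. The CudaAddFunctor it forwards is defined elsewhere in Paddle and is not part of this diff; under the same functor convention it would look roughly like this sketch:

// Sketch of a same-type binary functor under this convention (the real
// CudaAddFunctor is defined outside this diff).
template <typename T>
struct CudaAddFunctor {
  __device__ __forceinline__ T operator()(const T* args) const {
    return args[0] + args[1];
  }
};
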
@@ -49,69 +49,73 @@ int GetVectorizedSizeImpl(const T *pointer) {
return 1;
}
- template <typename T>
+ template <typename InT, typename OutT>
int GetVectorizedSize(const std::vector<const framework::Tensor *> &ins,
const std::vector<framework::Tensor *> &outs) {
int vec_size = 4;
for (auto iter = ins.begin(); iter != ins.end(); ++iter) {
vec_size =
- std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<T>()));
+ std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<InT>()));
}
for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
vec_size =
- std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<T>()));
+ std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<OutT>()));
}
return vec_size;
}
- template <ElementwiseType ET, int VecSize, typename T>
+ template <ElementwiseType ET, int VecSize, typename InT, typename OutT>
struct ElementwiseDataWrapper {
- T *out;
- const T *in0;
- const T *in1;
- __device__ ElementwiseDataWrapper(T *out, const T *in0,
- const T *in1 = nullptr)
+ OutT *out;
+ const InT *in0;
+ const InT *in1;
+ __device__ ElementwiseDataWrapper(OutT *out, const InT *in0,
+ const InT *in1 = nullptr)
: out(out), in0(in0), in1(in1) {}
- using VecType = CudaAlignedVector<T, VecSize>;
+ using InVecType = CudaAlignedVector<InT, VecSize>;
+ using OutVecType = CudaAlignedVector<OutT, VecSize>;
- inline __device__ void load_vector(VecType args[], int idx) {
- const VecType *x_vec = reinterpret_cast<const VecType *>(in0);
+ inline __device__ void load_vector(InVecType args[], int idx) {
+ const InVecType *x_vec = reinterpret_cast<const InVecType *>(in0);
args[0] = x_vec[idx];
if (ET == ElementwiseType::kBinary) {
- const VecType *y_vec = reinterpret_cast<const VecType *>(in1);
+ const InVecType *y_vec = reinterpret_cast<const InVecType *>(in1);
args[1] = y_vec[idx];
}
}
- inline __device__ void load_scalar(T args[], int idx) {
+ inline __device__ void load_scalar(InT args[], int idx) {
args[0] = in0[idx];
if (ET == ElementwiseType::kBinary) {
args[1] = in1[idx];
}
}
- inline __device__ void store_vector(VecType res, int idx) {
- VecType *out_vec = reinterpret_cast<VecType *>(out);
+ inline __device__ void store_vector(OutVecType res, int idx) {
+ OutVecType *out_vec = reinterpret_cast<OutVecType *>(out);
out_vec[idx] = res;
}
- inline __device__ void store_scalar(T res, int idx) { out[idx] = res; }
+ inline __device__ void store_scalar(OutT res, int idx) { out[idx] = res; }
};
- template <ElementwiseType ET, int VecSize, typename T, typename Functor>
+ template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
+ typename Functor>
__device__ void VectorizedKernelImpl(
- ElementwiseDataWrapper<ET, VecSize, T> data, Functor func, int tid) {
- using VecType = CudaAlignedVector<T, VecSize>;
- VecType ins_vec[ET];
- VecType out_vec;
- T *ins_ptr[ET];
- T *out_ptr;
+ ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
+ int tid) {
+ using InVecType = CudaAlignedVector<InT, VecSize>;
+ using OutVecType = CudaAlignedVector<OutT, VecSize>;
+ InVecType ins_vec[ET];
+ OutVecType out_vec;
+ InT *ins_ptr[ET];
+ OutT *out_ptr;
#pragma unroll
for (int i = 0; i < ET; ++i) {
- ins_ptr[i] = reinterpret_cast<T *>(&(ins_vec[i]));
+ ins_ptr[i] = reinterpret_cast<InT *>(&(ins_vec[i]));
}
- out_ptr = reinterpret_cast<T *>(&out_vec);
+ out_ptr = reinterpret_cast<OutT *>(&out_vec);
// load
data.load_vector(ins_vec, tid);
@@ -119,7 +123,7 @@ __device__ void VectorizedKernelImpl(
// compute
#pragma unroll
for (int i = 0; i < VecSize; ++i) {
- T ins[ET];
+ InT ins[ET];
#pragma unroll
for (int j = 0; j < ET; ++j) {
ins[j] = ins_ptr[j][i];
@@ -131,11 +135,13 @@ __device__ void VectorizedKernelImpl(
data.store_vector(out_vec, tid);
}
- template <ElementwiseType ET, int VecSize, typename T, typename Functor>
- __device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, VecSize, T> data,
- Functor func, int start, int remain) {
- T ins[ET];
- T out;
+ template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
+ typename Functor>
+ __device__ void ScalarKernelImpl(
+ ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
+ int start, int remain) {
+ InT ins[ET];
+ OutT out;
for (int i = 0; i < remain; ++i) {
int idx = start + i;
@@ -148,14 +154,15 @@ __device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, VecSize, T> data,
}
}
- template <ElementwiseType ET, int VecSize, typename T, typename Functor>
- __global__ void VectorizedKernel(const T *__restrict__ in0,
- const T *__restrict__ in1, T *out, int size,
- Functor func) {
+ template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
+ typename Functor>
+ __global__ void VectorizedKernel(const InT *__restrict__ in0,
+ const InT *__restrict__ in1, OutT *out,
+ int size, Functor func) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int remain = size - VecSize * tid;
remain = remain > 0 ? remain : 0;
- auto data = ElementwiseDataWrapper<ET, VecSize, T>(out, in0, in1);
+ auto data = ElementwiseDataWrapper<ET, VecSize, InT, OutT>(out, in0, in1);
if (remain >= VecSize) {
VectorizedKernelImpl(data, func, tid);
} else {
@@ -163,30 +170,31 @@ __global__ void VectorizedKernel(const T *__restrict__ in0,
}
}
- template <ElementwiseType ET, typename T, typename Functor>
- __global__ void ScalarKernel(const T *__restrict__ in0,
- const T *__restrict__ in1, T *out, int size,
+ template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
+ __global__ void ScalarKernel(const InT *__restrict__ in0,
+ const InT *__restrict__ in1, OutT *out, int size,
Functor func) {
- auto data = ElementwiseDataWrapper<ET, 1, T>(out, in0, in1);
+ auto data = ElementwiseDataWrapper<ET, 1, InT, OutT>(out, in0, in1);
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int remain = tid < size ? 1 : 0;
ScalarKernelImpl(data, func, tid, remain);
}
- template <ElementwiseType ET, typename T, typename Functor>
+ template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
void LaunchElementwiseCudaKernel(
const platform::CUDADeviceContext &ctx,
const std::vector<const framework::Tensor *> &ins,
std::vector<framework::Tensor *> *outs, Functor func) {
// calculate the max vec_size for all ins and outs
auto size = ins[0]->numel();
- int vec_size = GetVectorizedSize<T>(ins, *outs);
+ int vec_size = GetVectorizedSize<InT, OutT>(ins, *outs);
int block_size = ELEMENTWISE_BLOCK_SIZE;
int grid_size =
((size + vec_size - 1) / vec_size + block_size - 1) / block_size;
- const T *in0 = ins[0]->data<T>();
- const T *in1 = (ET == ElementwiseType::kBinary) ? ins[1]->data<T>() : nullptr;
- T *out = (*outs)[0]->data<T>();
+ const InT *in0 = ins[0]->data<InT>();
+ const InT *in1 =
+ (ET == ElementwiseType::kBinary) ? ins[1]->data<InT>() : nullptr;
+ OutT *out = (*outs)[0]->data<OutT>();
// cuda kernel
auto stream = ctx.stream();
switch (vec_size) {
......
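
For orientation, the launcher above picks the vectorization width from the alignment of every input pointer (as InT) and every output pointer (as OutT), computes just enough blocks to cover ceil(size / vec_size) threads, and the switch on vec_size (cases not shown here) dispatches to the matching instantiation; inside VectorizedKernel each thread handles VecSize elements and falls back to the scalar path for the tail where fewer than VecSize elements remain. A compressed standalone sketch of that structure with different input and output types (float in, double out, VecSize fixed at 2; invented names, not Paddle's code):

// Compressed standalone sketch of the vectorized-with-scalar-tail pattern,
// using float in / double out and VecSize = 2 (CUDA's float2/double2 stand in
// for CudaAlignedVector).
#include <cuda_runtime.h>
#include <cstdio>

__global__ void CastVec2(const float* in, double* out, int size) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int remain = size - 2 * tid;           // elements left for this thread
  if (remain >= 2) {                     // vectorized path: one float2 load
    float2 v = reinterpret_cast<const float2*>(in)[tid];
    reinterpret_cast<double2*>(out)[tid] = make_double2(v.x, v.y);
  } else if (remain == 1) {              // scalar tail
    out[2 * tid] = static_cast<double>(in[2 * tid]);
  }
}

int main() {
  const int size = 5;                    // odd size forces a scalar tail
  float h_in[size] = {1.f, 2.f, 3.f, 4.f, 5.f};
  double h_out[size] = {0};
  float* d_in;
  double* d_out;
  cudaMalloc(&d_in, size * sizeof(float));
  cudaMalloc(&d_out, size * sizeof(double));
  cudaMemcpy(d_in, h_in, size * sizeof(float), cudaMemcpyHostToDevice);
  int vec_size = 2, block_size = 256;
  // Same arithmetic as the launcher above: ceil(size / vec_size) threads,
  // then ceil(threads / block_size) blocks -> 3 threads, 1 block here.
  int grid_size =
      ((size + vec_size - 1) / vec_size + block_size - 1) / block_size;
  CastVec2<<<grid_size, block_size>>>(d_in, d_out, size);
  cudaMemcpy(h_out, d_out, size * sizeof(double), cudaMemcpyDeviceToHost);
  for (int i = 0; i < size; ++i) std::printf("%.1f ", h_out[i]);  // 1.0 .. 5.0
  std::printf("\n");
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
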