From 9814f89551e2133c6733352f6445d4d668da6f63 Mon Sep 17 00:00:00 2001
From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com>
Date: Fri, 8 Oct 2021 10:47:13 +0800
Subject: [PATCH] fix cast cuda implementation (#36266)

---
 paddle/fluid/operators/cast_op.cu | 64 ++++++++++++++++---------------
 1 file changed, 33 insertions(+), 31 deletions(-)
diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu
index 06300817e0..601735c2f1 100644
--- a/paddle/fluid/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
@@ -47,12 +47,12 @@ __global__ void CastCUDAKernel(const InT* in, const int64_t N, OutT* out) {
 }
 
 template <typename InT>
-struct CastOpFunctor<platform::CUDADeviceContext, InT> {
+struct CastCUDAOpFunctor {
   const framework::Tensor* in_;
   framework::Tensor* out_;
   const platform::CUDADeviceContext& ctx_;
-  CastOpFunctor(const framework::Tensor* in, framework::Tensor* out,
-                const platform::CUDADeviceContext& ctx)
+  CastCUDAOpFunctor(const framework::Tensor* in, framework::Tensor* out,
+                    const platform::CUDADeviceContext& ctx)
       : in_(in), out_(out), ctx_(ctx) {}
 
   template <typename OutT>
@@ -75,6 +75,21 @@ struct CastOpFunctor<platform::CUDADeviceContext, InT> {
   }
 };
 
+template <typename InT>  // InT is forwarded to CastCUDAOpFunctor<InT> below
+class CastCUDAOpKernel : public framework::OpKernel<InT> {  // CUDA kernel class registered for op "cast"
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {  // reads input "X", writes output "Out"
+    auto* in = context.Input<framework::Tensor>("X");  // source tensor
+    auto* out = context.Output<framework::Tensor>("Out");  // destination tensor
+    framework::VisitDataType(  // NOTE(review): presumably invokes the functor with OutT matching the dtype below - confirm
+        static_cast<framework::proto::VarType::Type>(
+            context.Attr<int>("out_dtype")),  // target dtype is read from the int attribute "out_dtype"
+        CastCUDAOpFunctor<InT>(  // functor keeps in/out pointers and a reference to the CUDA device context
+            in, out,
+            context.template device_context<platform::CUDADeviceContext>()));
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -82,34 +97,21 @@ namespace ops = paddle::operators;
 
 #ifdef PADDLE_WITH_HIP
 REGISTER_OP_CUDA_KERNEL(
-    cast, ops::CastOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int16_t>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::float16>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::complex<float>>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::complex<double>>);
+    cast, ops::CastCUDAOpKernel<float>, ops::CastCUDAOpKernel<double>,  // HIP build: one CastCUDAOpKernel instantiation per InT
+    ops::CastCUDAOpKernel<int>, ops::CastCUDAOpKernel<int64_t>,
+    ops::CastCUDAOpKernel<int16_t>, ops::CastCUDAOpKernel<bool>,
+    ops::CastCUDAOpKernel<uint8_t>,
+    ops::CastCUDAOpKernel<paddle::platform::float16>,  // no bfloat16 entry follows in this HIP list
+    ops::CastCUDAOpKernel<paddle::platform::complex<float>>,
+    ops::CastCUDAOpKernel<paddle::platform::complex<double>>);
 #else
 REGISTER_OP_CUDA_KERNEL(
-    cast, ops::CastOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int16_t>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::float16>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::bfloat16>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::complex<float>>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::complex<double>>);
+    cast, ops::CastCUDAOpKernel<float>, ops::CastCUDAOpKernel<double>,  // non-HIP build: one CastCUDAOpKernel instantiation per InT
+    ops::CastCUDAOpKernel<int>, ops::CastCUDAOpKernel<int64_t>,
+    ops::CastCUDAOpKernel<int16_t>, ops::CastCUDAOpKernel<bool>,
+    ops::CastCUDAOpKernel<uint8_t>,
+    ops::CastCUDAOpKernel<paddle::platform::float16>,
+    ops::CastCUDAOpKernel<paddle::platform::bfloat16>,  // only this non-HIP list registers bfloat16
+    ops::CastCUDAOpKernel<paddle::platform::complex<float>>,
+    ops::CastCUDAOpKernel<paddle::platform::complex<double>>);
 #endif
-- 
GitLab