fix cast cuda implementation (#36679)

4d3c7f33 · Zeng Jinle · GitHub · bdcc2ad4 · 4d3c7f33
显示空白变更内容
内联并排

Showing 1 changed file with 33 additions and 31 deletions

paddle/fluid/operators/cast_op.cu paddle/fluid/operators/cast_op.cu +33 -31

未找到文件。
--- a/paddle/fluid/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
@@ -47,11 +47,11 @@ __global__ void CastCUDAKernel(const InT* in, const int64_t N, OutT* out) {
 }

 template <typename InT>
-struct CastOpFunctor<platform::CUDADeviceContext, InT> {
+struct CastCUDAOpFunctor {
  const framework::Tensor* in_;
  framework::Tensor* out_;
  const platform::CUDADeviceContext& ctx_;
-  CastOpFunctor(const framework::Tensor* in, framework::Tensor* out,
+  CastCUDAOpFunctor(const framework::Tensor* in, framework::Tensor* out,
                    const platform::CUDADeviceContext& ctx)
      : in_(in), out_(out), ctx_(ctx) {}

@@ -75,6 +75,21 @@ struct CastOpFunctor<platform::CUDADeviceContext, InT> {
  }
 };

+template <typename InT>  // InT: element type this kernel is registered for
+class CastCUDAOpKernel : public framework::OpKernel<InT> {  // CUDA-only kernel class for the cast op
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");  // tensor to convert
+    auto* out = context.Output<framework::Tensor>("Out");  // tensor receiving the result
+    framework::VisitDataType(  // presumably invokes the functor for the dtype below -- confirm
+        static_cast<framework::proto::VarType::Type>(
+            context.Attr<int>("out_dtype")),  // target dtype is stored as an int attribute
+        CastCUDAOpFunctor<InT>(  // functor keeps in/out pointers and a reference to the context
+            in, out,
+            context.template device_context<platform::CUDADeviceContext>()));
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

@@ -82,34 +97,21 @@ namespace ops = paddle::operators;

 #ifdef PADDLE_WITH_HIP
 REGISTER_OP_CUDA_KERNEL(
-    cast, ops::CastOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int16_t>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::float16>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::complex<float>>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::complex<double>>);
+    cast, ops::CastCUDAOpKernel<float>, ops::CastCUDAOpKernel<double>,  // HIP branch: this list has no bfloat16 entry
+    ops::CastCUDAOpKernel<int>, ops::CastCUDAOpKernel<int64_t>,
+    ops::CastCUDAOpKernel<int16_t>, ops::CastCUDAOpKernel<bool>,
+    ops::CastCUDAOpKernel<uint8_t>,
+    ops::CastCUDAOpKernel<paddle::platform::float16>,
+    ops::CastCUDAOpKernel<paddle::platform::complex<float>>,
+    ops::CastCUDAOpKernel<paddle::platform::complex<double>>);
 #else
 REGISTER_OP_CUDA_KERNEL(
-    cast, ops::CastOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int16_t>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::float16>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::bfloat16>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::complex<float>>,
-    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::complex<double>>);
+    cast, ops::CastCUDAOpKernel<float>, ops::CastCUDAOpKernel<double>,  // non-HIP branch: same types as the HIP list plus bfloat16
+    ops::CastCUDAOpKernel<int>, ops::CastCUDAOpKernel<int64_t>,
+    ops::CastCUDAOpKernel<int16_t>, ops::CastCUDAOpKernel<bool>,
+    ops::CastCUDAOpKernel<uint8_t>,
+    ops::CastCUDAOpKernel<paddle::platform::float16>,
+    ops::CastCUDAOpKernel<paddle::platform::bfloat16>,
+    ops::CastCUDAOpKernel<paddle::platform::complex<float>>,
+    ops::CastCUDAOpKernel<paddle::platform::complex<double>>);
 #endif