From b79c6a9b3315742283a923ed52a5934c231beeff Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Fri, 25 Mar 2022 14:13:20 +0800
Subject: [PATCH] add cast_grad phi kernel (#40798)

* add cast_grad phi kernel

* refine unittest

* refine unittest

* refine unittest

* refine include header path

* refine xpu cast unittest

* refine code
---
 paddle/phi/kernels/cast_grad_kernel.h         | 27 ++++++++++
 paddle/phi/kernels/cpu/cast_grad_kernel.cc    | 49 +++++++++++++++++
 paddle/phi/kernels/cpu/cast_impl.h            | 47 ++++++++++++++++
 paddle/phi/kernels/cpu/cast_kernel.cc         | 29 +---------
 paddle/phi/kernels/gpu/cast_grad_kernel.cu    | 53 +++++++++++++++++++
 paddle/phi/kernels/gpu/cast_impl.h            | 42 +++++++++++++++
 paddle/phi/kernels/gpu/cast_kernel.cu         | 34 +-----------
 .../fluid/tests/unittests/test_cast_op.py     |  4 ++
 .../white_list/op_accuracy_white_list.py      |  3 +-
 .../tests/unittests/xpu/test_cast_op_xpu.py   |  1 +
 python/paddle/utils/code_gen/api.yaml         |  1 +
 python/paddle/utils/code_gen/backward.yaml    | 10 ++++
 12 files changed, 239 insertions(+), 61 deletions(-)
 create mode 100644 paddle/phi/kernels/cast_grad_kernel.h
 create mode 100644 paddle/phi/kernels/cpu/cast_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/cast_impl.h
 create mode 100644 paddle/phi/kernels/gpu/cast_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/cast_impl.h

diff --git a/paddle/phi/kernels/cast_grad_kernel.h b/paddle/phi/kernels/cast_grad_kernel.h
new file mode 100644
index 0000000000..6b98bc88de
--- /dev/null
+++ b/paddle/phi/kernels/cast_grad_kernel.h
@@ -0,0 +1,27 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CastGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/cast_grad_kernel.cc b/paddle/phi/kernels/cpu/cast_grad_kernel.cc
new file mode 100644
index 0000000000..c294c743bd
--- /dev/null
+++ b/paddle/phi/kernels/cpu/cast_grad_kernel.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/phi/kernels/cast_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/cast_impl.h" + +namespace phi { + +template +void CastGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + PD_VISIT_ALL_TYPES(x.dtype(), "CastKernelImpl", ([&] { + CastKernelImpl(dev_ctx, out_grad, x_grad); + })); +} + +} // namespace phi + +PD_REGISTER_KERNEL(cast_grad, + CPU, + ALL_LAYOUT, + phi::CastGradKernel, + float, + double, + int, + int64_t, + int16_t, + bool, + uint8_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/cpu/cast_impl.h b/paddle/phi/kernels/cpu/cast_impl.h new file mode 100644 index 0000000000..d39ef24e7b --- /dev/null +++ b/paddle/phi/kernels/cpu/cast_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/platform/transform.h" + +namespace phi { + +template +struct CastOpTransformFunctor { + HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } +}; + +template +void CastKernelImpl(const CPUContext& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto* in_begin = x.data(); + auto numel = x.numel(); + auto* in_end = in_begin + numel; + + auto* out_begin = dev_ctx.Alloc(out); + + paddle::platform::Transform trans; + trans(dev_ctx, + in_begin, + in_end, + out_begin, + CastOpTransformFunctor()); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/cast_kernel.cc b/paddle/phi/kernels/cpu/cast_kernel.cc index c2c207bfaf..800962544c 100644 --- a/paddle/phi/kernels/cpu/cast_kernel.cc +++ b/paddle/phi/kernels/cpu/cast_kernel.cc @@ -13,39 +13,12 @@ // limitations under the License. #include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/cpu/cast_impl.h" -#include "paddle/phi/api/ext/dispatch.h" -#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -// See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/platform/transform.h" - namespace phi { -template -struct CastOpTransformFunctor { - HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } -}; - -template -void CastKernelImpl(const CPUContext& dev_ctx, - const DenseTensor& x, - DenseTensor* out) { - auto* in_begin = x.data(); - auto numel = x.numel(); - auto* in_end = in_begin + numel; - - auto* out_begin = dev_ctx.Alloc(out); - - paddle::platform::Transform trans; - trans(dev_ctx, - in_begin, - in_end, - out_begin, - CastOpTransformFunctor()); -} - template void CastKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/gpu/cast_grad_kernel.cu b/paddle/phi/kernels/gpu/cast_grad_kernel.cu new file mode 100644 index 0000000000..1c1d8cf2c0 --- /dev/null +++ b/paddle/phi/kernels/gpu/cast_grad_kernel.cu @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cast_grad_kernel.h" +#include "paddle/phi/kernels/gpu/cast_impl.h" + +namespace phi { + +template +void CastGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + PD_VISIT_ALL_TYPES(x.dtype(), "CastCUDAKernelImpl", ([&] { + CastCUDAKernelImpl(dev_ctx, out_grad, x_grad); + })); +} + +} // namespace phi + +#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \ + PD_REGISTER_KERNEL(cast_grad, \ + GPU, \ + ALL_LAYOUT, \ + phi::CastGradKernel, \ + float, \ + double, \ + int, \ + int64_t, \ + int16_t, \ + bool, \ + uint8_t, \ + phi::dtype::float16, \ + phi::dtype::complex, \ + phi::dtype::complex, \ + ##__VA_ARGS__) { \ + kernel->OutputAt(0).SetDataType( \ + paddle::experimental::DataType::UNDEFINED); \ + } + +PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast_grad, phi::dtype::bfloat16) diff --git a/paddle/phi/kernels/gpu/cast_impl.h b/paddle/phi/kernels/gpu/cast_impl.h new file mode 100644 index 0000000000..8f6351e675 --- /dev/null +++ b/paddle/phi/kernels/gpu/cast_impl.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include "paddle/phi/api/ext/dispatch.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+
+namespace phi {
+
+template <typename InT, typename OutT>
+struct CastFuctor {
+  __device__ __forceinline__ OutT operator()(const InT x) const {
+    return static_cast<OutT>(x);
+  }
+};
+
+template <typename InT, typename OutT>
+void CastCUDAKernelImpl(const GPUContext& dev_ctx,
+                        const DenseTensor& x,
+                        DenseTensor* out) {
+  std::vector<const DenseTensor*> inputs;
+  std::vector<DenseTensor*> outputs;
+  inputs.emplace_back(&x);
+  outputs.emplace_back(out);
+  dev_ctx.Alloc<OutT>(out);
+  phi::funcs::ElementwiseKernel<OutT>(
+      dev_ctx, inputs, &outputs, CastFuctor<InT, OutT>());
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu
index 542234c80b..7c4cadbc90 100644
--- a/paddle/phi/kernels/gpu/cast_kernel.cu
+++ b/paddle/phi/kernels/gpu/cast_kernel.cu
@@ -12,42 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cast_kernel.h"
-
-#include "paddle/phi/api/ext/dispatch.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/funcs/elementwise_base.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/phi/common/bfloat16.h"
-#include "paddle/phi/common/float16.h"
-#include "paddle/phi/kernels/funcs/aligned_vector.h"
+#include "paddle/phi/kernels/cast_kernel.h"
+#include "paddle/phi/kernels/gpu/cast_impl.h"
 
 namespace phi {
 
-template <typename InT, typename OutT>
-struct CastFuctor {
-  __device__ __forceinline__ OutT operator()(const InT x) const {
-    return static_cast<OutT>(x);
-  }
-};
-
-template <typename InT, typename OutT>
-void CastCUDAKernelImpl(const GPUContext& dev_ctx,
-                        const DenseTensor& x,
-                        DenseTensor* out) {
-  std::vector<const DenseTensor*> inputs;
-  std::vector<DenseTensor*> outputs;
-  inputs.emplace_back(&x);
-  outputs.emplace_back(out);
-  dev_ctx.Alloc<OutT>(out);
-  phi::funcs::ElementwiseKernel<OutT>(
-      dev_ctx, inputs, &outputs, CastFuctor<InT, OutT>());
-}
-
 template <typename T, typename Context>
 void CastKernel(const Context& dev_ctx,
                 const DenseTensor& x,
diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py
index 988e7df41a..d80a9dc920 100644
--- a/python/paddle/fluid/tests/unittests/test_cast_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cast_op.py
@@ -52,6 +52,7 @@ class TestCastOpFp16ToFp32(OpTest):
             'out_dtype': int(core.VarDesc.VarType.FP32)
         }
         self.op_type = 'cast'
+        self.__class__.no_need_check_grad = True
 
     def test_check_output(self):
         self.check_output(atol=1e-3)
@@ -67,6 +68,7 @@ class TestCastOpFp32ToFp16(OpTest):
             'out_dtype': int(core.VarDesc.VarType.FP16)
         }
         self.op_type = 'cast'
+        self.__class__.no_need_check_grad = True
 
     def test_check_output(self):
         self.check_output(atol=1e-3)
@@ -82,6 +84,7 @@ class TestCastOpBf16ToFp32(OpTest):
             'out_dtype': int(core.VarDesc.VarType.FP32)
         }
         self.op_type = 'cast'
+        self.__class__.no_need_check_grad = True
 
     def test_check_output(self):
         self.check_output()
@@ -97,6 +100,7 @@ class TestCastOpFp32ToBf16(OpTest):
             'out_dtype': int(core.VarDesc.VarType.BF16)
         }
         self.op_type = 'cast'
+        self.__class__.no_need_check_grad = True
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py
index 581656f6cd..9b57b0d824 100644
--- a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py
@@ -76,7 +76,8 @@ NO_FP64_CHECK_GRAD_OP_LIST = [
     'trilinear_interp_v2', \
     'var_conv_2d', \
     'warpctc', \
-    'bilateral_slice'
+    'bilateral_slice', \
+    'cast'
 ]
 
 NO_FP16_CHECK_GRAD_OP_LIST = [
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py
index 2447408296..08d4810a65 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py
@@ -44,6 +44,7 @@ def create_test_class(in_typename, out_typename):
                 'out_dtype': typeid_dict[out_typename],
             }
             self.op_type = 'cast'
+            self.__class__.no_need_check_grad = True
 
         def test_check_output(self):
             if paddle.is_compiled_with_xpu():
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index e1b13ab440..11e8d67ca4 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -16,6 +16,7 @@
     func : cast
     param : [x, out_dtype]
     data_type : x
+  backward : cast_grad
 
 - api : concat
 
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml
index ff5ebd6ef6..6b597ce44a 100644
--- a/python/paddle/utils/code_gen/backward.yaml
+++ b/python/paddle/utils/code_gen/backward.yaml
@@ -307,6 +307,16 @@
   kernel :
     func : mv_grad
 
+- backward_api : cast_grad
+  forward : cast (Tensor x, DataType out_dtype) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : cast_grad
+    data_type : out_grad
 
 # =================================== sep0
--
GitLab
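
Editor's note: a quick way to exercise the backward path this patch adds, from the Python side. This is a minimal dygraph sketch, not part of the patch; the tensor values and dtypes are illustrative only.

    import paddle

    # Cast forward to a wider dtype, then backpropagate. CastGradKernel
    # dispatches on x.dtype(), so the incoming float64 out_grad is cast
    # back and x.grad is returned in x's original float32 dtype.
    x = paddle.to_tensor([1.0, 2.0, 3.0], dtype='float32', stop_gradient=False)
    y = paddle.cast(x, 'float64')
    y.sum().backward()
    print(x.grad)  # [1., 1., 1.] with dtype paddle.float32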