fix the mean grad OP performance improvement test=develop (#21658)

5eec8cf5 · wangchaochaohu · GitHub · 29f64c8c · 5eec8cf5
隐藏空白更改
内联并排

Showing with 9 addition and 6 deletion

paddle/fluid/operators/mean_op.cu paddle/fluid/operators/mean_op.cu +9 -6

未找到文件。
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -31,10 +31,11 @@ struct DivideFunctor {
 };
 template <typename T>
-__global__ void MeanRunKernel(const T in_data, T* out_data, int N) {
+__global__ void MeanRunKernel(const T* in_data, T* out_data, int N) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  T data = in_data[0];
  for (; idx < N; idx += blockDim.x * gridDim.x) {
-    out_data[idx] = in_data / (static_cast<T>(N));
+    out_data[idx] = data / (static_cast<T>(N));
  }
 }
@@ -85,7 +86,7 @@ class MeanCUDAGradKernel : public framework::OpKernel<T> {
    auto IG = context.Output<Tensor>(framework::GradVarName("X"));
    IG->mutable_data<T>(context.GetPlace());
-    T in_data = OG[0];
+    auto in_data = OG->data<T>();
    auto size_prob = IG->numel();
    auto out_data = IG->data<T>();
    int threads = 512;
@@ -105,6 +106,8 @@ REGISTER_OP_CUDA_KERNEL(
    ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, double>,
    ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
-    mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
+    mean_grad,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext,
+                            plat::float16>);