Optimize elementwise_add_grad op, test=develop (#32051)

1e52f324 · jiangcheng · GitHub · 36687d7a · 1e52f324
隐藏空白更改
内联并排

Showing 1 changed file with 33 additions and 12 deletions

paddle/fluid/operators/elementwise/elementwise_add_op.cu paddle/fluid/operators/elementwise/elementwise_add_op.cu +33 -12

未找到文件。
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -112,18 +112,39 @@ elementwise_add_grad(const framework::ExecutionContext& ctx,
                     const framework::Tensor* out,
                     const framework::Tensor* dout, framework::Tensor* dx,
                     framework::Tensor* dy) {
-  auto size = x->numel();
+  auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-  int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
+  auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
-  dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
+  auto* dout_data = dout->data<T>();
-  dim3 grid_size =
+  if (dx_data == dout_data && dy_data != dout_data) {
-      dim3(((size + vec_size - 1) / vec_size + PADDLE_CUDA_THREAD_SIZE - 1) /
+    VLOG(4) << "Special case when dx_data is the same as dout_data, "
-               PADDLE_CUDA_THREAD_SIZE,
+               "only need copy dout to dy";
-           1);
+    framework::TensorCopy(
-  SimpleElemwiseAddGradCUDAKernel<
+        *dout, ctx.GetPlace(),
-      T><<<grid_size, block_size, 0,
+        ctx.template device_context<platform::DeviceContext>(), dy);
-           ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
+  } else if (dx_data != dout_data && dy_data == dout_data) {
-      dout->data<T>(), size, vec_size, dx->mutable_data<T>(ctx.GetPlace()),
+    VLOG(4) << "Special case when dy_data is the same as dout_data, "
-      dy->mutable_data<T>(ctx.GetPlace()));
+               "only need copy dout to dx";
+    framework::TensorCopy(
+        *dout, ctx.GetPlace(),
+        ctx.template device_context<platform::DeviceContext>(), dx);
+  } else if (dx_data != dout_data && dy_data != dout_data) {
+    auto size = x->numel();
+    int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
+    dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
+    dim3 grid_size =
+        dim3(((size + vec_size - 1) / vec_size + PADDLE_CUDA_THREAD_SIZE - 1) /
+                 PADDLE_CUDA_THREAD_SIZE,
+             1);
+    SimpleElemwiseAddGradCUDAKernel<
+        T><<<grid_size, block_size, 0,
+             ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
+        dout->data<T>(), size, vec_size, dx->mutable_data<T>(ctx.GetPlace()),
+        dy->mutable_data<T>(ctx.GetPlace()));
+  } else {
+    VLOG(4) << "Special case when dy_data is the same as dout_data, "
+               "and dx_data is the same as dout_data, do not need "
+               "any operator";
+  }
 }
 }  // namespace operators