gather scatter with cuda streams

84b8baf1 · zchen0211 · 15941dbd · 84b8baf1 · 84b8baf1 · 84b8baf1
4 changed file
--- a/paddle/operators/gather.cu.h
+++ b/paddle/operators/gather.cu.h
@@ -46,9 +46,9 @@ __global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
 * return: output tensor
 */
 template <typename T>
-void GPUGather(const Place& place, const Tensor* src, const Tensor* index,
+void GPUGather(const platform::DeviceContext& ctx, const Tensor* src,
-               Tensor* output) {
+               const Tensor* index, Tensor* output) {
-  PADDLE_ENFORCE(platform::is_gpu_place(place));
+  // PADDLE_ENFORCE(platform::is_gpu_place(place));
  // check index of shape 1-D
  PADDLE_ENFORCE(index->dims().size() == 1);
  int index_size = index->dims()[0];
@@ -68,8 +68,11 @@ void GPUGather(const Place& place, const Tensor* src, const Tensor* index,
  int block = 512;
  int n = slice_size * index_size;
  int grid = (n + block - 1) / block;
-  GatherCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
-                                       slice_size);
+  GatherCUDAKernel<T><<<
+      grid, block, 0,
+      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      p_src, p_index, p_output, index_size, slice_size);
 }
 }  // namespace operators

--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
@@ -32,7 +32,7 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
    output->mutable_data<T>(ctx.GetPlace());
-    GPUGather<T>(ctx.GetPlace(), x, index, output);
+    GPUGather<T>(ctx.device_context(), x, index, output);
  }
 };
@@ -42,7 +42,6 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");
-    LOG(INFO) << "Gather grad here";
    auto *Index = ctx.Input<Tensor>("Index");
    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
@@ -53,7 +52,7 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
    auto place = ctx.GetEigenDevice<platform::GPUPlace>();
    dxt.device(place) = dxt.constant(static_cast<T>(0));
-    GPUScatterAssign<T>(ctx.GetPlace(), dO, Index, dX);
+    GPUScatterAssign<T>(ctx.device_context(), dO, Index, dX);
  }
 };

--- a/paddle/operators/scatter.cu.h
+++ b/paddle/operators/scatter.cu.h
@@ -45,11 +45,11 @@ __global__ void ScatterCUDAKernel(const T* params, const int* indices,
 * return: output tensor
 */
 template <typename T>
-void GPUScatterAssign(const platform::Place& place,
+void GPUScatterAssign(const platform::DeviceContext& ctx,
                      const paddle::framework::Tensor* src,
                      const paddle::framework::Tensor* index,
                      paddle::framework::Tensor* output) {
-  PADDLE_ENFORCE(platform::is_gpu_place(place));
+  // PADDLE_ENFORCE(platform::is_gpu_place(place));
  // check index of shape 1-D
  PADDLE_ENFORCE(index->dims().size() == 1);
  int index_size = index->dims()[0];
@@ -70,8 +70,10 @@ void GPUScatterAssign(const platform::Place& place,
  int n = slice_size * index_size;
  int grid = (n + block - 1) / block;
-  ScatterCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
+  ScatterCUDAKernel<T><<<
-                                        slice_size);
+      grid, block, 0,
+      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      p_src, p_index, p_output, index_size, slice_size);
 }
 }  // namespace operators

--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
@@ -32,7 +32,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
    Out->ShareDataWith<T>(*Ref);
-    GPUScatterAssign<T>(ctx.GetPlace(), Updates, Index, Out);
+    GPUScatterAssign<T>(ctx.device_context(), Updates, Index, Out);
  }
 };
@@ -51,7 +51,7 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
    dRef->ShareDataWith<T>(*dOut);
    dUpdates->mutable_data<T>(ctx.GetPlace());
    // Gradient by Gather: dUpdates = dO[Index]
-    GPUGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    GPUGather<T>(ctx.device_context(), dOut, Index, dUpdates);
  }
 };