From 84b8baf1967e327712269e7632235438d09759d9 Mon Sep 17 00:00:00 2001
From: zchen0211
Date: Mon, 2 Oct 2017 15:50:24 -0700
Subject: [PATCH] gather scatter with cuda streams

---
 paddle/operators/gather.cu.h   | 13 ++++++++-----
 paddle/operators/gather_op.cu  |  5 ++---
 paddle/operators/scatter.cu.h  | 10 ++++++----
 paddle/operators/scatter_op.cu |  4 ++--
 4 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h
index b400c104407..2ae11376a2b 100644
--- a/paddle/operators/gather.cu.h
+++ b/paddle/operators/gather.cu.h
@@ -46,9 +46,9 @@ __global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
  * return: output tensor
  */
 template <typename T>
-void GPUGather(const Place& place, const Tensor* src, const Tensor* index,
-               Tensor* output) {
-  PADDLE_ENFORCE(platform::is_gpu_place(place));
+void GPUGather(const platform::DeviceContext& ctx, const Tensor* src,
+               const Tensor* index, Tensor* output) {
+  // PADDLE_ENFORCE(platform::is_gpu_place(place));
   // check index of shape 1-D
   PADDLE_ENFORCE(index->dims().size() == 1);
   int index_size = index->dims()[0];
@@ -68,8 +68,11 @@ void GPUGather(const Place& place, const Tensor* src, const Tensor* index,
   int block = 512;
   int n = slice_size * index_size;
   int grid = (n + block - 1) / block;
-  GatherCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
-                                       slice_size);
+
+  GatherCUDAKernel<T><<<
+      grid, block, 0,
+      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      p_src, p_index, p_output, index_size, slice_size);
 }
 
 }  // namespace operators
diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu
index 06004614b2c..9937be5915d 100644
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
@@ -32,7 +32,7 @@ class GatherOpCUDAKernel : public framework::OpKernel {
 
     output->mutable_data<T>(ctx.GetPlace());
 
-    GPUGather<T>(ctx.GetPlace(), x, index, output);
+    GPUGather<T>(ctx.device_context(), x, index, output);
   }
 };
 
@@ -42,7 +42,6 @@ class GatherGradOpCUDAKernel : public framework::OpKernel {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-    LOG(INFO) << "Gather grad here";
     auto *Index = ctx.Input<Tensor>("Index");
     auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
@@ -53,7 +52,7 @@ class GatherGradOpCUDAKernel : public framework::OpKernel {
     auto place = ctx.GetEigenDevice<platform::GPUPlace>();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
 
-    GPUScatterAssign<T>(ctx.GetPlace(), dO, Index, dX);
+    GPUScatterAssign<T>(ctx.device_context(), dO, Index, dX);
   }
 };
 
diff --git a/paddle/operators/scatter.cu.h b/paddle/operators/scatter.cu.h
index add4791a793..f4a3965d94c 100644
--- a/paddle/operators/scatter.cu.h
+++ b/paddle/operators/scatter.cu.h
@@ -45,11 +45,11 @@ __global__ void ScatterCUDAKernel(const T* params, const int* indices,
  * return: output tensor
  */
 template <typename T>
-void GPUScatterAssign(const platform::Place& place,
+void GPUScatterAssign(const platform::DeviceContext& ctx,
                       const paddle::framework::Tensor* src,
                       const paddle::framework::Tensor* index,
                       paddle::framework::Tensor* output) {
-  PADDLE_ENFORCE(platform::is_gpu_place(place));
+  // PADDLE_ENFORCE(platform::is_gpu_place(place));
   // check index of shape 1-D
   PADDLE_ENFORCE(index->dims().size() == 1);
   int index_size = index->dims()[0];
@@ -70,8 +70,10 @@ void GPUScatterAssign(const platform::Place& place,
   int n = slice_size * index_size;
   int grid = (n + block - 1) / block;
 
-  ScatterCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
-                                        slice_size);
+  ScatterCUDAKernel<T><<<
+      grid, block, 0,
+      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      p_src, p_index, p_output, index_size, slice_size);
 }
 
 }  // namespace operators
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
index 831eabdae4f..6d13a876f98 100644
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
@@ -32,7 +32,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel {
 
     Out->ShareDataWith(*Ref);
 
-    GPUScatterAssign<T>(ctx.GetPlace(), Updates, Index, Out);
+    GPUScatterAssign<T>(ctx.device_context(), Updates, Index, Out);
   }
 };
 
@@ -51,7 +51,7 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel {
     dRef->ShareDataWith(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates = dO[Index]
-    GPUGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    GPUGather<T>(ctx.device_context(), dOut, Index, dUpdates);
  }
 };
 
-- 
GitLab
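Note (not part of the patch): the change above routes every gather/scatter kernel launch onto the stream owned by the operator's DeviceContext instead of the default stream. The standalone CUDA sketch below illustrates that launch pattern with a plain cudaStream_t created locally; the kernel name GatherKernel, main(), and the toy sizes are illustrative only and are not Paddle APIs.

// Minimal sketch of launching a gather-style kernel on an explicit stream,
// mirroring the <<<grid, block, 0, stream>>> form used in the patch.
#include <cstdio>
#include <cuda_runtime.h>

template <typename T>
__global__ void GatherKernel(const T* src, const int* indices, T* out,
                             int index_size, int slice_size) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < index_size * slice_size) {
    int idx = i / slice_size;  // which index entry
    int off = i % slice_size;  // offset inside the slice
    out[i] = src[indices[idx] * slice_size + off];
  }
}

int main() {
  const int index_size = 2, slice_size = 4, src_rows = 3;
  float h_src[src_rows * slice_size];
  for (int i = 0; i < src_rows * slice_size; ++i) h_src[i] = float(i);
  int h_idx[index_size] = {2, 0};
  float h_out[index_size * slice_size];

  float *d_src, *d_out;
  int* d_idx;
  cudaMalloc(&d_src, sizeof(h_src));
  cudaMalloc(&d_idx, sizeof(h_idx));
  cudaMalloc(&d_out, sizeof(h_out));

  cudaStream_t stream;
  cudaStreamCreate(&stream);  // stands in for the stream a DeviceContext owns

  cudaMemcpyAsync(d_src, h_src, sizeof(h_src), cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_idx, h_idx, sizeof(h_idx), cudaMemcpyHostToDevice, stream);

  int n = index_size * slice_size;
  int block = 512;
  int grid = (n + block - 1) / block;
  // Same launch form as the patch: grid, block, shared memory, stream.
  GatherKernel<float><<<grid, block, 0, stream>>>(d_src, d_idx, d_out,
                                                  index_size, slice_size);

  cudaMemcpyAsync(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);

  for (int i = 0; i < n; ++i) printf("%.0f ", h_out[i]);
  printf("\n");  // expected: 8 9 10 11 0 1 2 3

  cudaFree(d_src);
  cudaFree(d_idx);
  cudaFree(d_out);
  cudaStreamDestroy(stream);
  return 0;
}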