1 api

78808b20 · zchen0211 · b851515b · 78808b20 · 78808b20 · 78808b20
10 changed files
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -169,7 +169,7 @@ void CondOp::Run(const Scope& scope,
      tensor_child->Resize(dim);
      tensor_child->mutable_data<float>(dim, platform::CPUPlace());
-      CPUTGather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
+      CPUGather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
                       tensor_child);
    }
  }

--- a/paddle/operators/gather.cu.h
+++ b/paddle/operators/gather.cu.h
@@ -38,19 +38,6 @@ __global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
  }
 }
-// Implementation of GPU copy:
-template <typename T>
-struct GPUGather {
-  void operator()(const T* src, const int* index, const int slice_size,
-                  const int index_size, T* output) {
-    int block = 512;
-    int n = slice_size * index_size;
-    int grid = (n + block - 1) / block;
-    GatherCUDAKernel<T><<<grid, block>>>(src, index, output, index_size,
-                                         slice_size);
-  }
-};
 /**
 * A thin wrapper on gpu tensor
 * Return a new tensor from source tensor, gathered according to index
@@ -59,7 +46,7 @@ struct GPUGather {
 * return: output tensor
 */
 template <typename T>
-void GPUTGather(const Place& place, const Tensor* src, const Tensor* index,
+void GPUGather(const Place& place, const Tensor* src, const Tensor* index,
               Tensor* output) {
  PADDLE_ENFORCE(platform::is_gpu_place(place));
  // check index of shape 1-D
@@ -74,10 +61,15 @@ void GPUTGather(const Place& place, const Tensor* src, const Tensor* index,
  int slice_size = 1;
  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-  // Gathering
+  const T* p_src = src->data<T>();
-  GPUGather<T> gather_functor;
+  const int* p_index = index->data<int>();
-  gather_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
+  T* p_output = output->data<T>();
-                 output->data<T>());
+  int block = 512;
+  int n = slice_size * index_size;
+  int grid = (n + block - 1) / block;
+  GatherCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
+                                       slice_size);
 }
 }  // namespace operators

--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
@@ -24,29 +24,15 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-// Implementation of CPU copy
-template <typename T>
-struct CPUGather {
-  void operator()(const T* src, const int* indices, const int slice_size,
-                  const int index_size, T* output) {
-    const size_t slice_bytes = slice_size * sizeof(T);
-    for (int i = 0; i < index_size; ++i) {
-      int index_ = indices[i];
-      memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
-    }
-  }
-};
 /**
- * A thin wrapper on cpu tensor
+ * A thin wrapper for gathering on cpu tensor
 * Return a new tensor from source tensor, gathered according to index
 * input[src]: type-T source Tensor
 * input[index]: type-int index Tensor (1-D)
 * return: output tensor
 */
 template <typename T>
-void CPUTGather(const platform::Place& place,
+void CPUGather(const platform::Place& place,
               const paddle::framework::Tensor* src,
               const paddle::framework::Tensor* index,
               paddle::framework::Tensor* output) {
@@ -59,14 +45,20 @@ void CPUTGather(const platform::Place& place,
  framework::DDim output_dims(src_dims);
  output_dims[0] = index_size;
+  const T* p_src = src->data<T>();
+  const int* p_index = index->data<int>();
+  T* p_output = output->data<T>();
  // slice size
  int slice_size = 1;
  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-  // Gathering
+  const size_t slice_bytes = slice_size * sizeof(T);
-  CPUGather<T> gather_functor;
-  gather_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
+  for (int i = 0; i < index_size; ++i) {
-                 output->data<T>());
+    int index_ = p_index[i];
+    memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
+  }
 }
 }  // namespace operators

--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
@@ -32,7 +32,7 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
    output->mutable_data<T>(ctx.GetPlace());
-    GPUTGather<T>(ctx.GetPlace(), x, index, output);
+    GPUGather<T>(ctx.GetPlace(), x, index, output);
  }
 };
@@ -53,7 +53,7 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
    auto place = ctx.GetEigenDevice<platform::GPUPlace>();
    dxt.device(place) = dxt.constant(static_cast<T>(0));
-    GPUTScatter<T>(ctx.GetPlace(), dO, Index, dX);
+    GPUScatterAssign<T>(ctx.GetPlace(), dO, Index, dX);
  }
 };

--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
@@ -36,7 +36,7 @@ class GatherOpKernel : public framework::OpKernel<T> {
    output->mutable_data<T>(ctx.GetPlace());
-    CPUTGather<T>(ctx.GetPlace(), x, index, output);
+    CPUGather<T>(ctx.GetPlace(), x, index, output);
  }
 };

--- a/paddle/operators/gather_test.cc
+++ b/paddle/operators/gather_test.cc
@@ -41,7 +41,7 @@ TEST(Gather, GatherData) {
  int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
-  CPUTGather<int>(CPUPlace(), src, index, output);
+  CPUGather<int>(CPUPlace(), src, index, output);
  for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
  for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);

--- a/paddle/operators/scatter.cu.h
+++ b/paddle/operators/scatter.cu.h
@@ -36,20 +36,6 @@ __global__ void ScatterCUDAKernel(const T* params, const int* indices,
  }
 }
-// Implementation of GPU copy:
-template <typename T>
-struct GPUScatterAssign {
-  void operator()(const T* src, const int* index, const int slice_size,
-                  const int index_size, T* output) {
-    int block = 512;
-    int n = slice_size * index_size;
-    int grid = (n + block - 1) / block;
-    // printf("grid, block: %d %d\n", grid, block);
-    ScatterCUDAKernel<T><<<grid, block>>>(src, index, output, index_size,
-                                          slice_size);
-  }
-};
 /**
 * A thin wrapper on gpu tensor
 * Return a new updated tensor from source tensor, scatter-assigned according to
@@ -59,7 +45,7 @@ struct GPUScatterAssign {
 * return: output tensor
 */
 template <typename T>
-void GPUTScatter(const platform::Place& place,
+void GPUScatterAssign(const platform::Place& place,
                      const paddle::framework::Tensor* src,
                      const paddle::framework::Tensor* index,
                      paddle::framework::Tensor* output) {
@@ -76,10 +62,16 @@ void GPUTScatter(const platform::Place& place,
  int slice_size = 1;
  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-  // Scatter Assign
+  const T* p_src = src->data<T>();
-  GPUScatterAssign<T> scatter_functor;
+  const int* p_index = index->data<int>();
-  scatter_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
+  T* p_output = output->data<T>();
-                  output->data<T>());
+  int block = 512;
+  int n = slice_size * index_size;
+  int grid = (n + block - 1) / block;
+  ScatterCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
+                                        slice_size);
 }
 }  // namespace operators

--- a/paddle/operators/scatter.h
+++ b/paddle/operators/scatter.h
@@ -25,19 +25,6 @@ namespace operators {
 using Tensor = framework::Tensor;
-// Implementation of CPU copy
-template <typename T>
-void CPUScatterAssign(const T* src, const int* index, const int slice_size,
-                      const int index_size, T* output) {
-  // paddle::framework::DDim output_dims = output->dims();
-  const size_t slice_bytes = slice_size * sizeof(T);
-  for (int i = 0; i < index_size; ++i) {
-    int index_ = index[i];
-    memcpy(output + index_ * slice_size, src + i * slice_size, slice_bytes);
-  }
-}
 /**
 * Return an updated tensor from source tensor, scattered according to index:
 * dst[index[i]] = src[i]
@@ -70,7 +57,12 @@ void ScatterAssign(const platform::Place& place,
  size_t slice_size = 1;
  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-  CPUScatterAssign<T>(p_src, p_index, slice_size, index_size, p_output);
+  const size_t slice_bytes = slice_size * sizeof(T);
+  for (int i = 0; i < index_size; ++i) {
+    int index_ = p_index[i];
+    memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes);
+  }
 }
 }  // namespace operators

--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
@@ -32,7 +32,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
    Out->ShareDataWith<T>(*Ref);
-    GPUTScatter<T>(ctx.GetPlace(), Updates, Index, Out);
+    GPUScatterAssign<T>(ctx.GetPlace(), Updates, Index, Out);
  }
 };
@@ -51,7 +51,7 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
    dRef->ShareDataWith<T>(*dOut);
    dUpdates->mutable_data<T>(ctx.GetPlace());
    // Gradient by Gather: dUpdates = dO[Index]
-    GPUTGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    GPUGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
  }
 };

--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
@@ -56,7 +56,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
    dRef->ShareDataWith<T>(*dOut);
    dUpdates->mutable_data<T>(ctx.GetPlace());
    // Gradient by Gather: dUpdates = dO[Index]
-    CPUTGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    CPUGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
  }
 };