scatter.cu.h 8.7 KB
Newer Older
1
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Z
zchen0211 已提交
2

L
Luo Tao 已提交
3 4 5
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Z
zchen0211 已提交
6

L
Luo Tao 已提交
7
    http://www.apache.org/licenses/LICENSE-2.0
Z
zchen0211 已提交
8

L
Luo Tao 已提交
9 10 11 12 13
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Z
zchen0211 已提交
14 15

#pragma once
16
#include <unordered_set>
17
#include <vector>
18
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
19
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
20 21
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
22
#include "paddle/phi/kernels/funcs/math_function.h"
Z
zchen0211 已提交
23

24 25
namespace phi {
namespace funcs {
26

27
template <typename T, typename IndexT = int>
28 29 30 31
__global__ void ScatterInitCUDAKernel(const IndexT* indices,
                                      T* output,
                                      size_t index_size,
                                      size_t slice_size) {
Z
Zeng Jinle 已提交
32 33 34
  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
    int64_t indices_i = i / slice_size;
    int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice
35
    IndexT scatter_i = indices[indices_i];
36 37 38 39 40 41 42 43

    PADDLE_ENFORCE(scatter_i >= 0,
                   "The index is out of bounds, "
                   "please check whether the dimensions of index and "
                   "input meet the requirements. It should "
                   "be greater than or equal to 0, but received [%d]",
                   scatter_i);

Z
Zeng Jinle 已提交
44
    int64_t out_i = scatter_i * slice_size + slice_i;
45 46 47
    *(output + out_i) = static_cast<T>(0);
  }
}
Z
zchen0211 已提交
48

49
template <typename T, typename IndexT = int>
50 51 52 53 54 55
__global__ void ScatterCUDAKernel(const T* params,
                                  const IndexT* indices,
                                  T* output,
                                  size_t index_size,
                                  size_t slice_size,
                                  bool overwrite) {
Z
Zeng Jinle 已提交
56 57 58
  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
    int64_t indices_i = i / slice_size;
    int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice
59
    IndexT scatter_i = indices[indices_i];
60 61 62 63 64 65 66 67

    PADDLE_ENFORCE(scatter_i >= 0,
                   "The index is out of bounds, "
                   "please check whether the dimensions of index and "
                   "input meet the requirements. It should "
                   "be greater than or equal to 0, but received [%d]",
                   scatter_i);

Z
Zeng Jinle 已提交
68
    int64_t out_i = scatter_i * slice_size + slice_i;
69 70 71 72 73
    if (overwrite) {
      *(output + out_i) = *(params + i);
    } else {
      paddle::platform::CudaAtomicAdd(output + out_i, *(params + i));
    }
Z
zchen0211 已提交
74 75 76
  }
}

77
template <typename T, typename IndexT = int>
78 79 80
__global__ void ScatterNdCUDAKernel(const T* update,
                                    const IndexT* indices,
                                    T* output,
81
                                    const Dim<DDim::kMaxRank> output_dims,
82 83
                                    size_t remain_size,
                                    size_t slice_size,
84
                                    size_t end_size) {
Z
Zeng Jinle 已提交
85 86 87 88
  CUDA_KERNEL_LOOP_TYPE(i, remain_size * slice_size, int64_t) {
    int64_t indices_i = i / slice_size;
    int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice
    int64_t gather_i = 0;
89 90 91
    int64_t temp = slice_size;
    for (int64_t j = end_size - 1; j >= 0; --j) {
      IndexT index_value = indices[indices_i * end_size + j];
92 93 94 95 96 97 98

      PADDLE_ENFORCE(
          index_value >= 0 && index_value < output_dims[j],
          "The index is out of bounds, "
          "please check whether the dimensions of index and "
          "input meet the requirements. It should "
          "be less than [%d] and greater or equal to 0, but received [%d]",
99 100
          output_dims[j],
          index_value);
101

102 103 104
      gather_i += (index_value * temp);
      temp *= output_dims[j];
    }
Z
Zeng Jinle 已提交
105
    int64_t output_i = gather_i + slice_i;
106 107 108 109
    paddle::platform::CudaAtomicAdd(output + output_i, *(update + i));
  }
}

Z
zchen0211 已提交
110 111 112 113 114
/**
 * A thin wrapper on gpu tensor
 * Return a new updated tensor from source tensor, scatter-assigned according to
 * index
 * input[src]: type-T source Tensor
115
 * input[index]: type-IndexT index Tensor (1-D)
Z
zchen0211 已提交
116 117
 * return: output tensor
 */
118
template <typename T, typename IndexT = int>
119 120 121 122
void GPUScatterAssign(const phi::GPUContext& ctx,
                      const DenseTensor& src,
                      const DenseTensor& index,
                      DenseTensor* output,
123
                      bool overwrite = true) {
Z
zchen0211 已提交
124
  // check index of shape 1-D
125
  if (index.dims().size() == 2) {
126 127 128 129 130 131 132
    PADDLE_ENFORCE_EQ(
        index.dims()[1],
        1,
        phi::errors::InvalidArgument("index.dims()[1] should be 1 when "
                                     "index.dims().size() = 2 in scatter_op."
                                     "But received value is [%d]",
                                     index.dims()[1]));
133
  } else {
134 135 136
    PADDLE_ENFORCE_EQ(index.dims().size(),
                      1,
                      phi::errors::InvalidArgument(
137 138 139
                          "index.dims().size() should be 1 or 2 in scatter_op."
                          "But received value is [%d]",
                          index.dims().size()));
140
  }
Z
Zeng Jinle 已提交
141
  int64_t index_size = index.dims()[0];
Z
zchen0211 已提交
142

143
  auto src_dims = src.dims();
144
  phi::DDim output_dims(src_dims);
Z
zchen0211 已提交
145 146 147
  output_dims[0] = index_size;

  // slice size
Z
Zeng Jinle 已提交
148
  int64_t slice_size = 1;
Z
zchen0211 已提交
149 150
  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];

151
  const T* p_src = src.data<T>();
152
  const IndexT* p_index = index.data<IndexT>();
Z
1 api  
zchen0211 已提交
153
  T* p_output = output->data<T>();
154
  const size_t& slice_bytes = slice_size * sizeof(T);
Z
1 api  
zchen0211 已提交
155

156
  // set block and grid num
Z
1 api  
zchen0211 已提交
157
  int block = 512;
Z
Zeng Jinle 已提交
158
  int64_t n = slice_size * index_size;
159 160
  dim3 grid = dim3((n + block - 1) / block);
  paddle::platform::LimitGridDim(ctx, &grid);
Z
1 api  
zchen0211 已提交
161

162 163
  // if not overwrite mode, init data
  if (!overwrite) {
164
    ScatterInitCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
S
ShenLiang 已提交
165
        p_index, p_output, index_size, slice_size);
166 167
  }

168
  ScatterCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
169
      p_src, p_index, p_output, index_size, slice_size, overwrite);
Z
zchen0211 已提交
170 171
}

S
ShenLiang 已提交
172 173 174
// The function is only for scatter grad x,
// however update grad use gather
template <typename T, typename IndexT = int>
175 176 177
void GPUScatterGradForX(const phi::GPUContext& ctx,
                        const DenseTensor& index,
                        DenseTensor* output) {
Z
Zeng Jinle 已提交
178
  int64_t index_size = index.dims()[0];
S
ShenLiang 已提交
179 180
  auto dst_dims = output->dims();
  // slice size
Z
Zeng Jinle 已提交
181
  int64_t slice_size = 1;
S
ShenLiang 已提交
182 183 184 185 186 187 188 189 190
  for (int i = 1; i < dst_dims.size(); ++i) slice_size *= dst_dims[i];
  const IndexT* p_index = index.data<IndexT>();
  T* p_output = output->data<T>();
  const size_t& slice_bytes = slice_size * sizeof(T);

  // set block and grid num
  int64_t block = 512;
  int64_t n = slice_size * index_size;
  int64_t height = (n + block - 1) / block;
191 192
  dim3 grid = dim3((n + block - 1) / block);
  paddle::platform::LimitGridDim(ctx, &grid);
S
ShenLiang 已提交
193

194
  ScatterInitCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
S
ShenLiang 已提交
195 196 197
      p_index, p_output, index_size, slice_size);
}

198 199 200 201 202
template <typename T, typename IndexT = int>
void GPUScatterNdAdd(const phi::GPUContext& ctx,
                     const DenseTensor& update,
                     const DenseTensor& index,
                     DenseTensor* output) {
203 204 205 206 207 208 209 210 211 212 213 214 215
  auto index_dims = index.dims();
  auto index_dims_size = index_dims.size();

  auto output_dims = output->dims();
  auto output_dims_size = output_dims.size();

  const T* p_update = update.data<T>();
  const IndexT* p_index = index.data<IndexT>();
  T* p_output = output->data<T>();

  // final dim
  int64_t end_size = index_dims[index_dims_size - 1];
  // remain dim
216 217
  auto remain_ddim = phi::slice_ddim(index_dims, 0, index_dims_size - 1);
  int64_t remain_numel = phi::product(remain_ddim);
218 219 220 221 222 223 224
  // slice size
  int64_t slice_size = 1;
  for (int64_t i = end_size; i < output_dims_size; ++i) {
    slice_size *= output_dims[i];
  }
  const size_t slice_bytes = slice_size * sizeof(T);

225
  Dim<DDim::kMaxRank> g_output_dims;
226
  for (int i = 0; i < output_dims_size; ++i) {
227
    g_output_dims[i] = output_dims[i];
228
  }
229

230
  int block = 512;
Z
Zeng Jinle 已提交
231
  int64_t n = slice_size * remain_numel;
232 233
  dim3 grid = dim3((n + block - 1) / block);
  paddle::platform::LimitGridDim(ctx, &grid);
234

235 236 237 238 239 240 241
  ScatterNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
      p_update,
      p_index,
      p_output,
      g_output_dims,
      remain_numel,
      slice_size,
242 243 244
      end_size);
}

245
}  // namespace funcs
246
}  // namespace phi