/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <vector>
#include "paddle/fluid/memory/memcpy.h"
// TODO(paddle-dev): move gpu_primitives.h to phi
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace phi {
namespace funcs {

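// Gathers index_size slices of slice_size contiguous elements from params:
// output[i * slice_size + j] = params[indices[i] * slice_size + j],
// computed with a grid-stride loop over the output elements.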
template <typename T, typename IndexT = int>
__global__ void GatherCUDAKernel(const T* params,
                                 const IndexT* indices,
                                 T* output,
                                 size_t index_size,
                                 size_t slice_size) {
  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
    int64_t indices_i = i / slice_size;
    int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice
    IndexT gather_i = indices[indices_i];
    int64_t params_i = gather_i * slice_size + slice_i;
    *(output + i) = *(params + params_i);
  }
}

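// gather_nd kernel: each of the remain_size index tuples (end_size integers,
// one per leading dimension of input) selects a slice of slice_size
// contiguous elements from input, which is copied to output.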
template <typename T, typename IndexT = int>
__global__ void GatherNdCUDAKernel(const T* input,
                                   const Dim<DDim::kMaxRank> input_dims,
                                   const IndexT* indices,
                                   T* output,
                                   size_t remain_size,
                                   size_t slice_size,
                                   size_t end_size) {
  CUDA_KERNEL_LOOP_TYPE(i, remain_size * slice_size, int64_t) {
    int64_t indices_i = i / slice_size;
    int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice
    int64_t gather_i = 0;
    int64_t temp = slice_size;
    for (int64_t j = end_size - 1; j >= 0; --j) {
      auto index_value = indices[indices_i * end_size + j];
      PADDLE_ENFORCE(
          index_value >= 0 && index_value < input_dims[j],
          "The index is out of bounds, "
          "please check whether the dimensions of index and "
          "input meet the requirements. It should "
          "be less than [%d] and greater than or equal to 0, but received [%d]",
          input_dims[j],
          index_value);
      gather_i += (index_value * temp);
      temp *= input_dims[j];
    }
    int64_t input_i = gather_i + slice_i;
    *(output + i) = *(input + input_i);
  }
}

/**
 * A thin gather wrapper on GPU tensors.
 * Returns a new tensor gathered from the source tensor according to index.
 * input[src]: type-T source Tensor
 * input[index]: type-IndexT index Tensor (1-D)
 * return: output tensor
 */
template <typename T, typename IndexT = int>
void GPUGather(const phi::GPUContext& ctx,
               const DenseTensor& src,
               const DenseTensor& index,
               DenseTensor* output) {
  if (index.dims().size() == 2) {
    PADDLE_ENFORCE_EQ(
        index.dims()[1],
        1,
        phi::errors::InvalidArgument("If the index's rank of gather_op is 2,"
                                     " the second dimension should be 1."));
  }

  // index size
  int64_t index_size = index.dims()[0];
  if (index_size == 0) return;

  auto src_dims = src.dims();
  phi::DDim output_dims(src_dims);
  output_dims[0] = index_size;

  // slice size
  int64_t slice_size = 1;
  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];

  const T* p_src = src.data<T>();
  const IndexT* p_index = index.data<IndexT>();
  T* p_output = output->data<T>();

  int block = 512;
  int64_t n = slice_size * index_size;
  int64_t grid = (n + block - 1) / block;
  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
  if (grid > maxGridDimX) {
    grid = maxGridDimX;
  }

  GatherCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
      p_src, p_index, p_output, index_size, slice_size);
}
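// Usage sketch (illustrative only; dev_ctx, src, index, and output are
// hypothetical objects prepared by the caller, with output already resized
// to [index.dims()[0], src_dims[1:]] and allocated on the device):
//   phi::funcs::GPUGather<float, int64_t>(dev_ctx, src, index, &output);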

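// GPU gather_nd: interprets the last dimension of index (of length end_size)
// as coordinate tuples into the leading dimensions of input and gathers the
// addressed slices into output.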
template <typename T, typename IndexT = int>
void GPUGatherNd(const phi::GPUContext& ctx,
                 const DenseTensor& input,
                 const DenseTensor& index,
                 DenseTensor* output) {
  const auto gplace = ctx.GetPlace();
  auto cplace = phi::CPUPlace();

  auto index_dims = index.dims();
  auto index_dims_size = index_dims.size();
  auto input_dims = input.dims();
  auto input_dims_size = input_dims.size();

  const T* p_input = input.data<T>();
  const IndexT* p_index = index.data<IndexT>();
  T* p_output = output->data<T>();

  // final dim
  int64_t end_size = index_dims[index_dims_size - 1];
  // remain dim
  auto remain_ddim = phi::slice_ddim(index_dims, 0, index_dims_size - 1);
  int64_t remain_numel = phi::product(remain_ddim);
  // slice size
  int64_t slice_size = 1;
  for (int64_t i = end_size; i < input_dims_size; ++i) {
    slice_size *= input_dims[i];
  }
  // source dim
  Dim<DDim::kMaxRank> g_input_dims;
  for (int i = 0; i < input_dims_size; ++i) {
    g_input_dims[i] = input_dims[i];
  }

  int block = 512;
  int64_t n = slice_size * remain_numel;
  int64_t grid = (n + block - 1) / block;
  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
  if (grid > maxGridDimX) {
    grid = maxGridDimX;
  }

  GatherNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(p_input,
                                                                  g_input_dims,
                                                                  p_index,
                                                                  p_output,
                                                                  remain_numel,
                                                                  slice_size,
                                                                  end_size);
}

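// Axis-gather kernel: input is viewed as
// [inner_dim_size, input_index_dim_size, outer_dim_size] and out as
// [inner_dim_size, out_index_dim_size, outer_dim_size];
// out[i][k][o] = input[i][index[k]][o], with a bounds check on index values.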
template <typename T, typename U>
__global__ void GatherGPUKernel(const T* input,
                                const U* index,
                                T* out,
                                int64_t outer_dim_size,
                                int64_t inner_dim_size,
                                int64_t out_index_dim_size,
                                int64_t input_index_dim_size,
                                int64_t size) {
  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
  int64_t outer_size = outer_dim_size * out_index_dim_size;
  for (; idx < size; idx += blockDim.x * gridDim.x) {
    int64_t inner_dim_index = idx / outer_size;
    int64_t next_idx = idx - outer_size * inner_dim_index;
    int64_t index_dim_index = next_idx / outer_dim_size;
    U index_val = index[index_dim_index];

    PADDLE_ENFORCE(
        index_val >= 0 && index_val < input_index_dim_size,
        "The index is out of bounds, "
        "please check whether the dimensions of index and "
        "input meet the requirements. It should "
        "be less than [%d] and greater than or equal to 0, but received [%d]",
        input_index_dim_size,
        index_val);

    int64_t out_dim_index = next_idx - outer_dim_size * index_dim_index;
    int64_t input_index =
        inner_dim_index * (outer_dim_size * input_index_dim_size) +
        index_val * outer_dim_size + out_dim_index;
    out[idx] = input[input_index];
  }
}

// Gradient kernel for the axis gather: each element of input (the upstream
// gradient) is atomically added into out at the position selected by index
// along the gather axis.
template <typename T, typename U>
__global__ void GatherGradGPUKernel(const T* input,
                                    const U* index,
                                    T* out,
                                    int64_t outer_dim_size,
                                    int64_t inner_dim_size,
                                    int64_t input_index_dim_size,
                                    int64_t out_index_dim_size,
                                    int64_t size) {
  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
  for (; idx < size; idx += blockDim.x * gridDim.x) {
    int64_t inner_dim_index = idx / (outer_dim_size * input_index_dim_size);
    int64_t next_idx = idx % (outer_dim_size * input_index_dim_size);
    int64_t index_dim_index = next_idx / (outer_dim_size);
    int64_t out_dim_index = next_idx % outer_dim_size;
    int64_t out_index =
        inner_dim_index * (outer_dim_size * out_index_dim_size) +
        index[index_dim_index] * outer_dim_size + out_dim_index;
    paddle::platform::CudaAtomicAdd(out + out_index, *(input + idx));
  }
}

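// Gathers input along dimension axis using index: resizes out to the
// gathered shape, allocates it on the device, and launches GatherGPUKernel.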
template <typename T, typename U>
void GatherV2CUDAFunction(const DenseTensor* input,
                          const DenseTensor* index,
                          const int axis,
                          DenseTensor* out,
                          const phi::GPUContext& ctx) {
  int64_t index_size = index->numel();
  int64_t input_size = input->numel();
  auto input_dim = input->dims();
  auto* input_data = input->data<T>();
  auto* index_data = index->data<U>();

  if (input->numel() == 0) return;

  int axis_index = axis;
  int64_t index_dim_size = input_dim[axis_index];

  int64_t inner_dim_size = 1;
  int64_t outer_dim_size = 1;
  std::vector<int64_t> out_dim_vec;

  for (int i = 0; i < axis_index; i++) {
    inner_dim_size *= input_dim[i];
    out_dim_vec.push_back(input_dim[i]);
  }
  out_dim_vec.push_back(index_size);
  for (int i = axis_index + 1; i < input_dim.size(); i++) {
    outer_dim_size *= input_dim[i];
    out_dim_vec.push_back(input_dim[i]);
  }
  auto out_dim = phi::make_ddim(out_dim_vec);

  out->Resize(out_dim);
  auto* out_data = ctx.Alloc<T>(out);
  int64_t out_size = out->numel();
  if (out_size == 0) return;

  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, out_size);
  auto stream = ctx.stream();
  GatherGPUKernel<
      T,
      U><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
      input_data,
      index_data,
      out_data,
      outer_dim_size,
      inner_dim_size,
      index_size,
      index_dim_size,
      out_size);
}
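// Example call (illustrative only; x, index, out, and dev_ctx are
// hypothetical objects; out is resized and allocated by the function itself):
//   phi::funcs::GatherV2CUDAFunction<float, int>(&x, &index, /*axis=*/1, &out, dev_ctx);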

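// Gradient of GatherV2CUDAFunction: zero-fills out and scatter-adds input
// (the upstream gradient) back into it along axis via GatherGradGPUKernel.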
template <typename T, typename U>
void GatherV2GradCUDAFunction(const DenseTensor* input,
                              const DenseTensor* index,
                              const int axis,
                              DenseTensor* out,
                              const phi::GPUContext& ctx) {
  auto* index_data = index->data<U>();
  int64_t index_size = index->numel();
  int64_t input_size = input->numel();
  auto input_dim = input->dims();
  auto* input_data = input->data<T>();

  if (input->numel() == 0) return;
  int axis_index = axis;
  int64_t input_index_dim_size = input_dim[axis_index];

  int64_t inner_dim_size = 1;
  int64_t outer_dim_size = 1;

  for (int i = 0; i < axis_index; i++) {
    inner_dim_size *= input_dim[i];
  }
  for (int i = axis_index + 1; i < input_dim.size(); i++) {
    outer_dim_size *= input_dim[i];
  }

  auto* out_data = ctx.Alloc<T>(out);
  auto out_dim = out->dims();
  int64_t out_index_dim_size = out_dim[axis_index];
  phi::funcs::set_constant(ctx, out, 0.0);

  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, input_size);
  auto stream = ctx.stream();
  GatherGradGPUKernel<
      T,
      U><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
      input_data,
      index_data,
      out_data,
      outer_dim_size,
      inner_dim_size,
      input_index_dim_size,
      out_index_dim_size,
      input_size);
}

}  // namespace funcs
}  // namespace phi