/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/platform/cuda_helper.h"

namespace paddle {
namespace operators {
namespace math {

// Device-side binary search with std::upper_bound semantics: returns the
// index of the first element in [first, first + count) that is strictly
// greater than val. The range must be sorted in ascending order.
template <typename T>
__device__ T upper_bound(const T* first, T count, T val) {
  const T* base = first;
  while (count > 0) {
    T half = count / 2;
    const T* mid = first + half;
    if (val < *mid) {
      // Answer lies in the lower half (mid itself may be it).
      count = half;
    } else {
      // *mid <= val, so the answer is strictly after mid.
      first = mid + 1;
      count -= half + 1;
    }
  }
  return first - base;
}

/*
 * Concatenates a set of row-major 2-D inputs (all with output_rows rows but
 * possibly different widths) column-wise into `output`.
 *
 * inputs:     device array of pointers to the input matrices.
 * input_cols: prefix sums of the input widths (input_cols[0] == 0,
 *             input_cols[i+1] - input_cols[i] is the width of input i);
 *             col_size is the table length.
 * output:     row-major [output_rows x output_cols] destination.
 *
 * Both grid dimensions use grid-stride loops, so any launch configuration
 * with at least one thread is valid.
 */
template <typename T>
__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
                             const int output_rows, const int output_cols,
                             T* output) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  // Locate the input segment that contains this thread's starting column.
  int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1;

  int curr_offset = input_cols[segment];
  int curr_segment = segment;
  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
    // Column offsets are indices: keep them in int. (The original declared
    // this as T, which silently loses precision for offsets beyond 2^24
    // when T is float.)
    int curr_col_offset;
    // Advance to the segment owning tid_x; the stride loop only moves
    // forward, so the segment index is monotonically non-decreasing.
    while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }

    int local_col = tid_x - curr_offset;
    int segment_width = curr_col_offset - curr_offset;
    T* input_ptr = inputs[curr_segment];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
      output[tid_y * output_cols + tid_x] =
          input_ptr[tid_y * segment_width + local_col];
  }
}

/*
 * Concatenates equally-shaped row-major 2-D inputs column-wise into `output`.
 * Every input is [output_rows x input_col]; output is
 * [output_rows x output_cols] with output_cols a multiple of input_col.
 * Both grid dimensions use grid-stride loops.
 */
template <typename T>
__global__ void KernelConcat(T** inputs, const int input_col,
                             const int output_rows, const int output_cols,
                             T* output) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
    // All inputs share the same width, so the owning input and the column
    // inside it follow from plain integer division. (The original computed
    // `tid_x * 1.0 / input_col` — a double multiply/divide that is slower
    // and inconsistent with the integer division in KernelConcatGrad.)
    int split = tid_x / input_col;
    int in_offset = tid_x - split * input_col;
    T* input_ptr = inputs[split];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) {
      output[tid_y * output_cols + tid_x] =
          input_ptr[tid_y * input_col + in_offset];
    }
  }
}

/*
 * Splits `input` ([input_row x input_col], row-major) column-wise into the
 * `outputs` matrices, whose (possibly different) widths are described by the
 * prefix-sum table `output_cols` of length `col_size`. This is the inverse
 * of the variable-width KernelConcat. Both grid dimensions use grid-stride
 * loops.
 */
template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
                                 const int input_col, const int* output_cols,
                                 int col_size, T** outputs) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  // Locate the output segment that contains this thread's starting column.
  int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1;
  int curr_offset = output_cols[segment];
  int curr_segment = segment;
  for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
    // Column offsets are indices: keep them in int. (The original declared
    // this as T, which silently loses precision for offsets beyond 2^24
    // when T is float.)
    int curr_col_offset;
    // Advance to the segment owning tid_x; the stride loop only moves
    // forward, so the segment index is monotonically non-decreasing.
    while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }

    int local_col = tid_x - curr_offset;
    int segment_width = curr_col_offset - curr_offset;
    T* output_ptr = outputs[curr_segment];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
      output_ptr[tid_y * segment_width + local_col] =
          input[tid_y * input_col + tid_x];
  }
}

/*
 * Scatters `input` ([input_row x input_col], row-major) column-wise into the
 * `outputs` matrices, all of which share the same width `output_col`. This
 * is the inverse of the equal-width KernelConcat. Both grid dimensions use
 * grid-stride loops.
 */
template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
                                 const int input_col, const int output_col,
                                 T** outputs) {
  const int stride_x = blockDim.x * gridDim.x;
  const int stride_y = blockDim.y * gridDim.y;
  for (int col = blockIdx.x * blockDim.x + threadIdx.x; col < input_col;
       col += stride_x) {
    // Which output owns this column, and the column index inside it.
    const int which = col / output_col;
    const int local_col = col - which * output_col;
    T* dst = outputs[which];
    for (int row = blockIdx.y * blockDim.y + threadIdx.y; row < input_row;
         row += stride_y) {
      dst[row * output_col + local_col] = input[row * input_col + col];
    }
  }
}

/*
 * All tensors' dimension should be the same and the values of
 * each dimension are the same, except the axis dimension.
 */
template <typename T>
class ConcatFunctor<platform::CUDADeviceContext, T> {
 public:
  // Concatenates `input` tensors along `axis` into `output` on the GPU.
  // Each tensor is viewed as a 2-D matrix whose row count is the product of
  // the dimensions before `axis`; the matrices are joined column-wise.
  void operator()(const platform::CUDADeviceContext& context,
                  const std::vector<framework::Tensor>& input, const int axis,
                  framework::Tensor* output) {
    // TODO(zcd): Add input data validity checking
    const int in_num = input.size();
    auto dim_0 = input[0].dims();
    int in_rows = 1;
    for (int i = 0; i < axis; ++i) {
      in_rows *= dim_0[i];
    }
    const int in_cols = input[0].numel() / in_rows;

    // Host-side staging buffers that mixed_vector mirrors to the device:
    // `inputs_data` holds the raw input pointers packed as int16_t elements
    // (hence sizeof(T*) / 2 elements per pointer), `inputs_cols` holds the
    // column prefix sums.
    framework::Vector<int16_t> inputs_data(in_num * sizeof(T*) / 2);
    framework::Vector<int> inputs_cols(in_num + 1);
    T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
    inputs_cols[0] = 0;

    int out_rows = in_rows;
    int out_cols = 0;
    bool all_same_width = true;
    for (int i = 0; i < in_num; ++i) {
      const int t_cols = input[i].numel() / in_rows;
      all_same_width = all_same_width && (t_cols == in_cols);
      out_cols += t_cols;
      inputs_cols[i + 1] = out_cols;
      inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
    }

    T** ins_gpu =
        reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
    const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());

    // Launch configuration: a 2-D block of up to 1024 threads whose x extent
    // is the output width rounded up to a whole warp (multiple of 32).
    const int kThreadsPerBlock = 1024;
    int block_cols = kThreadsPerBlock;
    if (out_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
      block_cols = ((out_cols + 31) >> 5) << 5;
    }
    const int block_rows = kThreadsPerBlock / block_cols;
    dim3 block_size = dim3(block_cols, block_rows, 1);

    // Size the grid from the device's maximum resident thread count.
    const int max_threads = context.GetMaxPhysicalThreadCount();
    const int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
    const int grid_cols =
        std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
    const int grid_rows =
        std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
    dim3 grid_size = dim3(grid_cols, grid_rows, 1);

    if (all_same_width) {
      // Equal widths: take the cheaper kernel that divides by one width.
      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
          ins_gpu, in_cols, out_rows, out_cols, output->data<T>());
    } else {
      // Mixed widths: the kernel walks the prefix-sum table.
      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
          ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows,
          out_cols, output->data<T>());
    }
  }
};

/*
 * All tensors' dimension should be the same and the values of
 * each dimension are the same, except the axis dimension.
 */
template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 public:
  // Splits `input` along `axis` back into the `outputs` tensors on the GPU:
  // the gradient (inverse) of ConcatFunctor. Each output is viewed as a 2-D
  // matrix with input_row rows; `input` is sliced column-wise among them.
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input, const int axis,
                  std::vector<framework::Tensor>& outputs) {
    // TODO(zcd): Add input data validity checking
    const int out_num = outputs.size();
    auto dim_0 = outputs[0].dims();
    int input_row = 1;
    for (int i = 0; i < axis; ++i) {
      input_row *= dim_0[i];
    }
    const int output_col_0 = outputs[0].numel() / input_row;

    // Host-side staging buffers that mixed_vector mirrors to the device:
    // `outputs_data` holds the raw output pointers packed as int16_t elements
    // (hence sizeof(T*) / 2 elements per pointer), `outputs_cols` holds the
    // column prefix sums.
    framework::Vector<int16_t> outputs_data(out_num * sizeof(T*) / 2);
    framework::Vector<int> outputs_cols(out_num + 1);
    T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
    outputs_cols[0] = 0;

    int input_col = 0;
    bool all_same_width = true;
    for (int i = 0; i < out_num; ++i) {
      const int t_col = outputs[i].numel() / input_row;
      all_same_width = all_same_width && (t_col == output_col_0);
      input_col += t_col;
      outputs_cols[i + 1] = input_col;
      outputs_ptr[i] = outputs[i].data<T>();
    }

    T** outs_gpu =
        reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
    const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());

    // Launch configuration: a 2-D block of up to 1024 threads whose x extent
    // is the total width rounded up to a whole warp (multiple of 32).
    const int kThreadsPerBlock = 1024;
    int block_cols = kThreadsPerBlock;
    if (input_col < kThreadsPerBlock) {  // block_cols is aligned by 32.
      block_cols = ((input_col + 31) >> 5) << 5;
    }
    const int block_rows = kThreadsPerBlock / block_cols;
    dim3 block_size = dim3(block_cols, block_rows, 1);

    // Size the grid from the device's maximum resident thread count.
    const int max_threads = context.GetMaxPhysicalThreadCount();
    const int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
    const int grid_cols =
        std::min((input_col + block_cols - 1) / block_cols, max_blocks);
    const int grid_rows =
        std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1));
    dim3 grid_size = dim3(grid_cols, grid_rows, 1);

    if (all_same_width) {
      // Equal widths: take the cheaper kernel that divides by one width.
      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
          input.data<T>(), input_row, input_col, output_col_0, outs_gpu);
    } else {
      // Mixed widths: the kernel walks the prefix-sum table.
      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
          input.data<T>(), input_row, input_col, outs_col_gpu,
          static_cast<int>(outputs_cols.size()), outs_gpu);
    }
  }
};

// Explicit instantiations for the element types the concat op supports.
template class ConcatFunctor<platform::CUDADeviceContext, int>;
template class ConcatFunctor<platform::CUDADeviceContext, int64_t>;
template class ConcatFunctor<platform::CUDADeviceContext, float>;
template class ConcatFunctor<platform::CUDADeviceContext, double>;

template class ConcatGradFunctor<platform::CUDADeviceContext, int>;
template class ConcatGradFunctor<platform::CUDADeviceContext, int64_t>;
template class ConcatGradFunctor<platform::CUDADeviceContext, float>;
template class ConcatGradFunctor<platform::CUDADeviceContext, double>;

}  // namespace math
}  // namespace operators
}  // namespace paddle