im2col.cu 15.6 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
H
hedaoyuan 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

15 16
#include <algorithm>
#include <vector>
Y
Yi Wang 已提交
17
#include "paddle/fluid/operators/math/im2col.h"
D
dzhwinter 已提交
18
#include "paddle/fluid/platform/cuda_primitives.h"
H
hedaoyuan 已提交
19 20

namespace paddle {
21
namespace operators {
22
namespace math {
H
hedaoyuan 已提交
23 24

template <class T>
C
chengduoZH 已提交
25 26
__global__ void im2col(const T* data_im, int num_outs, int im_height,
                       int im_width, int dilation_h, int dilation_w,
H
hedaoyuan 已提交
27 28
                       int filter_height, int filter_width, int stride_height,
                       int stride_width, int padding_height, int padding_width,
C
chengduoZH 已提交
29 30 31
                       int col_height, int col_width, T* data_col) {
  const int index =
      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
H
hedaoyuan 已提交
32
  if (index < num_outs) {
C
chengduoZH 已提交
33 34 35
    int w_out = index % col_width;
    int h_out = (index / col_width) % col_height;
    int channel_in = index / col_width / col_height;
H
hedaoyuan 已提交
36
    int channel_out = channel_in * filter_height * filter_width;
C
chengduoZH 已提交
37 38
    int h_in = h_out * stride_height - padding_height;
    int w_in = w_out * stride_width - padding_width;
H
hedaoyuan 已提交
39

C
chengduoZH 已提交
40 41
    data_col += (channel_out * col_height + h_out) * col_width + w_out;
    data_im += (channel_in * im_height + h_in) * im_width + w_in;
H
hedaoyuan 已提交
42 43
    for (int i = 0; i < filter_height; ++i) {
      for (int j = 0; j < filter_width; ++j) {
C
chengduoZH 已提交
44 45 46 47 48 49 50
        int rIdx = h_in + i * dilation_h;
        int cIdx = w_in + j * dilation_w;
        *data_col =
            (rIdx >= im_height || rIdx < 0 || cIdx >= im_width || cIdx < 0)
                ? 0
                : data_im[i * dilation_h * im_width + j * dilation_w];
        data_col += col_height * col_width;
H
hedaoyuan 已提交
51 52 53 54 55 56
      }
    }
  }
}

/*
H
hedaoyuan 已提交
57 58 59
 * im = [input_channels, input_height, input_width]
 * col =
 *   [input_channels, filter_height, filter_width, output_height, output_width]
H
hedaoyuan 已提交
60 61
 */
template <class T>
H
hedaoyuan 已提交
62
class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
Q
QI JUN 已提交
63
                    platform::CUDADeviceContext, T> {
H
hedaoyuan 已提交
64
 public:
Q
QI JUN 已提交
65
  void operator()(const platform::CUDADeviceContext& context,
C
chengduoZH 已提交
66 67 68
                  const framework::Tensor& im, const std::vector<int>& dilation,
                  const std::vector<int>& stride,
                  const std::vector<int>& padding, framework::Tensor* col) {
H
hedaoyuan 已提交
69
    PADDLE_ENFORCE(im.dims().size() == 3);
C
chengduoZH 已提交
70
    PADDLE_ENFORCE(col->dims().size() == 5);
H
hedaoyuan 已提交
71

C
chengduoZH 已提交
72 73 74
    int im_channels = im.dims()[0];
    int im_height = im.dims()[1];
    int im_width = im.dims()[2];
C
chengduoZH 已提交
75 76 77 78 79
    int filter_height = col->dims()[1];
    int filter_width = col->dims()[2];
    int col_height = col->dims()[3];
    int col_width = col->dims()[4];

C
chengduoZH 已提交
80
    int num_outputs = im_channels * col_height * col_width;
H
hedaoyuan 已提交
81 82 83
    int blocks = (num_outputs + 1024 - 1) / 1024;
    int block_x = 512;
    int block_y = (blocks + 512 - 1) / 512;
H
hedaoyuan 已提交
84
    dim3 threads(1024, 1);
H
hedaoyuan 已提交
85
    dim3 grid(block_x, block_y);
Q
QI JUN 已提交
86
    im2col<T><<<grid, threads, 0, context.stream()>>>(
C
chengduoZH 已提交
87 88 89
        im.data<T>(), num_outputs, im_height, im_width, dilation[0],
        dilation[1], filter_height, filter_width, stride[0], stride[1],
        padding[0], padding[1], col_height, col_width, col->data<T>());
H
hedaoyuan 已提交
90 91 92 93
  }
};

template <class T>
C
chengduoZH 已提交
94 95 96 97 98 99
__global__ void col2im(int n, const T* data_col, int im_height, int im_width,
                       int dilation_h, int dilation_w, int filter_height,
                       int filter_width, int stride_height, int stride_width,
                       int padding_height, int padding_width, int col_height,
                       int col_width, T* data_im) {
  const int index =
H
hedaoyuan 已提交
100
      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
C
chengduoZH 已提交
101 102 103 104

  const int d_filter_height = dilation_h * (filter_height - 1) + 1;
  const int d_filter_width = dilation_w * (filter_width - 1) + 1;

H
hedaoyuan 已提交
105 106
  if (index < n) {
    T val = 0;
C
chengduoZH 已提交
107 108
    int w = index % im_width + padding_width;
    int h = (index / im_width) % im_height + padding_height;
C
chengduoZH 已提交
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
    int c = index / (im_width * im_height);

    // compute the start and end of the output
    int w_col_start =
        (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1;
    int w_col_end = min(w / stride_width + 1, col_width);
    int h_col_start =
        (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1;
    int h_col_end = min(h / stride_height + 1, col_height);

    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
        int h_off = (h - h_col * stride_height);
        int w_off = (w - w_col * stride_width);
        if (h_off % dilation_h == 0 && w_off % dilation_w == 0) {
          h_off /= dilation_h;
          w_off /= dilation_w;
          int data_col_index =
              (((c * filter_height + h_off) * filter_width + w_off) *
                   col_height +
               h_col) *
                  col_width +
              w_col;
C
chengduoZH 已提交
132

C
chengduoZH 已提交
133
          val += data_col[data_col_index];
H
hedaoyuan 已提交
134 135 136
        }
      }
    }
C
chengduoZH 已提交
137
    data_im[index] = val;
H
hedaoyuan 已提交
138 139 140 141
  }
}

/*
H
hedaoyuan 已提交
142 143 144
 * im = [input_channels, input_height, input_width]
 * col =
 *   [input_channels, filter_height, filter_width, output_height, output_width]
H
hedaoyuan 已提交
145 146
 */
template <class T>
H
hedaoyuan 已提交
147
class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
Q
QI JUN 已提交
148
                    platform::CUDADeviceContext, T> {
H
hedaoyuan 已提交
149
 public:
Q
QI JUN 已提交
150
  void operator()(const platform::CUDADeviceContext& context,
C
chengduoZH 已提交
151 152 153 154 155
                  const framework::Tensor& col,
                  const std::vector<int>& dilation,
                  const std::vector<int>& stride,
                  const std::vector<int>& padding, framework::Tensor* im) {
    PADDLE_ENFORCE(im->dims().size() == 3);
H
hedaoyuan 已提交
156 157
    PADDLE_ENFORCE(col.dims().size() == 5);

C
chengduoZH 已提交
158 159 160
    int im_channels = im->dims()[0];
    int im_height = im->dims()[1];
    int im_width = im->dims()[2];
H
hedaoyuan 已提交
161 162
    int filter_height = col.dims()[1];
    int filter_width = col.dims()[2];
C
chengduoZH 已提交
163 164 165
    int col_height = col.dims()[3];
    int col_width = col.dims()[4];

C
chengduoZH 已提交
166 167 168
    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
                       (dilation[0] * (filter_height - 1) + 1)) /
                              stride[0] +
C
chengduoZH 已提交
169 170 171 172
                          1,
                      col_height,
                      "Output_height and padding(padding_up, padding_down) are "
                      "inconsistent.");
C
chengduoZH 已提交
173 174 175
    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
                       (dilation[1] * (filter_width - 1) + 1)) /
                              stride[1] +
C
chengduoZH 已提交
176 177 178 179 180 181
                          1,
                      col_width,
                      "col_width and padding(padding_left, padding_right) are "
                      "inconsistent.");

    size_t num_kernels = im_channels * im_height * im_width;
H
hedaoyuan 已提交
182

H
hedaoyuan 已提交
183 184 185
    size_t blocks = (num_kernels + 1024 - 1) / 1024;
    size_t block_x = 512;
    size_t block_y = (blocks + 512 - 1) / 512;
H
hedaoyuan 已提交
186
    dim3 threads(1024, 1);
H
hedaoyuan 已提交
187
    dim3 grid(block_x, block_y);
H
hedaoyuan 已提交
188 189 190

    // To avoid involving atomic operations, we will launch one kernel per
    // bottom dimension, and then in the kernel add up the top dimensions.
Q
QI JUN 已提交
191
    col2im<T><<<grid, threads, 0, context.stream()>>>(
C
chengduoZH 已提交
192 193 194
        num_kernels, col.data<T>(), im_height, im_width, dilation[0],
        dilation[1], filter_height, filter_width, stride[0], stride[1],
        padding[0], padding[2], col_height, col_width, im->data<T>());
H
hedaoyuan 已提交
195 196 197
  }
};

H
hedaoyuan 已提交
198
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
Q
QI JUN 已提交
199
                             platform::CUDADeviceContext, float>;
H
hedaoyuan 已提交
200
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
Q
QI JUN 已提交
201
                             platform::CUDADeviceContext, double>;
H
hedaoyuan 已提交
202
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
Q
QI JUN 已提交
203
                             platform::CUDADeviceContext, float>;
H
hedaoyuan 已提交
204
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
Q
QI JUN 已提交
205
                             platform::CUDADeviceContext, double>;
H
hedaoyuan 已提交
206 207

template <class T>
C
chengduoZH 已提交
208 209 210
__global__ void im2colOCF(const T* im_data, int im_channels, int im_height,
                          int im_width, int filter_height, int filter_width,
                          int stride_height, int stride_width,
C
chengduoZH 已提交
211
                          int padding_height, int padding_width, int col_height,
C
chengduoZH 已提交
212
                          int col_width, T* col_data) {
H
hedaoyuan 已提交
213 214
  int swid = blockIdx.x;
  int shid = blockIdx.y;
C
chengduoZH 已提交
215
  for (int channelid = threadIdx.z; channelid < im_channels;
H
hedaoyuan 已提交
216 217 218 219
       channelid += blockDim.z) {
    for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
      for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
        int width_offset = idx + swid * stride_width - padding_width;
C
chengduoZH 已提交
220
        int height_offset = idy + shid * stride_height - padding_height;
C
chengduoZH 已提交
221 222
        int im_offset = width_offset + height_offset * im_width +
                        channelid * im_height * im_width;
H
hedaoyuan 已提交
223

H
hedaoyuan 已提交
224 225
        int col_offset = idx + idy * filter_width +
                         channelid * filter_height * filter_width +
C
chengduoZH 已提交
226 227 228 229 230 231 232 233
                         (shid * col_width + swid) *
                             (im_channels * filter_height * filter_width);

        col_data[col_offset] =
            (height_offset >= im_height || height_offset < 0 ||
             width_offset >= im_width || width_offset < 0)
                ? T(0)
                : im_data[im_offset];
H
hedaoyuan 已提交
234 235 236 237 238 239
      }
    }
  }
}

/*
H
hedaoyuan 已提交
240 241 242
 * im = [input_channels, input_height, input_width]
 * col =
 *   [output_height, output_width, input_channels, filter_height, filter_width]
H
hedaoyuan 已提交
243 244
 */
template <class T>
H
hedaoyuan 已提交
245
class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
Q
QI JUN 已提交
246
                    platform::CUDADeviceContext, T> {
H
hedaoyuan 已提交
247
 public:
Q
QI JUN 已提交
248
  void operator()(const platform::CUDADeviceContext& context,
C
chengduoZH 已提交
249 250 251
                  const framework::Tensor& im, const std::vector<int>& dilation,
                  const std::vector<int>& stride,
                  const std::vector<int>& padding, framework::Tensor* col) {
H
hedaoyuan 已提交
252
    PADDLE_ENFORCE(im.dims().size() == 3);
C
chengduoZH 已提交
253
    PADDLE_ENFORCE(col->dims().size() == 5);
C
chengduoZH 已提交
254 255 256
    int im_channels = im.dims()[0];
    int im_height = im.dims()[1];
    int im_width = im.dims()[2];
C
chengduoZH 已提交
257 258 259 260 261
    int filter_height = col->dims()[3];
    int filter_width = col->dims()[4];
    int col_height = col->dims()[0];
    int col_width = col->dims()[1];

H
hedaoyuan 已提交
262 263 264 265 266 267 268 269 270 271 272
    int block_dim_x = 0;
    int block_dim_y = 0;
    if (filter_height <= 4 && filter_width <= 4) {
      block_dim_x = 4;
      block_dim_y = 4;
    } else if (filter_height <= 8 && filter_width <= 8) {
      block_dim_x = 8;
      block_dim_y = 8;
    } else if (filter_height <= 16 && filter_width <= 16) {
      block_dim_x = 16;
      block_dim_y = 16;
H
hedaoyuan 已提交
273
    } else {
H
hedaoyuan 已提交
274 275
      block_dim_x = 32;
      block_dim_y = 32;
H
hedaoyuan 已提交
276 277
    }

H
hedaoyuan 已提交
278
    int block_dim_z = 1024 / block_dim_x / block_dim_y;
C
chengduoZH 已提交
279 280
    dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels));
    dim3 grid(col_width, col_height);
Q
QI JUN 已提交
281
    im2colOCF<T><<<grid, threads, 0, context.stream()>>>(
C
chengduoZH 已提交
282 283 284
        im.data<T>(), im_channels, im_height, im_width, filter_height,
        filter_width, stride[0], stride[1], padding[0], padding[1], col_height,
        col_width, col->data<T>());
H
hedaoyuan 已提交
285 286 287 288
  }
};

template <class T>
C
chengduoZH 已提交
289 290 291
__global__ void col2imOCF(const T* col_data, int im_channels, int im_height,
                          int im_width, int filter_height, int filter_width,
                          int stride_height, int stride_width,
C
chengduoZH 已提交
292
                          int padding_height, int padding_width, int col_height,
C
chengduoZH 已提交
293
                          int col_width, T* im_data) {
H
hedaoyuan 已提交
294 295
  int swid = blockIdx.x;
  int shid = blockIdx.y;
C
chengduoZH 已提交
296
  for (int channelid = threadIdx.z; channelid < im_channels;
H
hedaoyuan 已提交
297 298 299 300
       channelid += blockDim.z) {
    for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
      for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
        int width_offset = idx + swid * stride_width - padding_width;
C
chengduoZH 已提交
301
        int height_offset = idy + shid * stride_height - padding_height;
C
chengduoZH 已提交
302 303
        int im_offset = width_offset + height_offset * im_width +
                        channelid * im_height * im_width;
H
hedaoyuan 已提交
304

H
hedaoyuan 已提交
305 306
        int col_offset = idx + idy * filter_width +
                         channelid * filter_height * filter_width +
C
chengduoZH 已提交
307 308
                         (shid * col_width + swid) *
                             (im_channels * filter_height * filter_width);
H
hedaoyuan 已提交
309

C
chengduoZH 已提交
310 311
        if (height_offset >= 0 && height_offset < im_height &&
            width_offset >= 0 && width_offset < im_width) {
H
hedaoyuan 已提交
312 313
          paddle::platform::CudaAtomicAdd(im_data + im_offset,
                                          col_data[col_offset]);
H
hedaoyuan 已提交
314 315 316 317 318 319 320
        }
      }
    }
  }
}

/*
H
hedaoyuan 已提交
321 322 323
 * im = [input_channels, input_height, input_width]
 * col =
 *   [output_height, output_width, input_channels, filter_height, filter_width]
H
hedaoyuan 已提交
324 325
 */
template <class T>
H
hedaoyuan 已提交
326
class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
Q
QI JUN 已提交
327
                    platform::CUDADeviceContext, T> {
H
hedaoyuan 已提交
328
 public:
Q
QI JUN 已提交
329
  void operator()(const platform::CUDADeviceContext& context,
C
chengduoZH 已提交
330 331 332 333 334
                  const framework::Tensor& col,
                  const std::vector<int>& dilation,
                  const std::vector<int>& stride,
                  const std::vector<int>& padding, framework::Tensor* im) {
    PADDLE_ENFORCE(im->dims().size() == 3);
H
hedaoyuan 已提交
335
    PADDLE_ENFORCE(col.dims().size() == 5);
C
chengduoZH 已提交
336 337 338
    int im_channels = im->dims()[0];
    int im_height = im->dims()[1];
    int im_width = im->dims()[2];
H
hedaoyuan 已提交
339 340
    int filter_height = col.dims()[3];
    int filter_width = col.dims()[4];
C
chengduoZH 已提交
341 342 343
    int col_height = col.dims()[0];
    int col_width = col.dims()[1];

C
chengduoZH 已提交
344 345 346
    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
                       (dilation[0] * (filter_height - 1) + 1)) /
                              stride[0] +
C
chengduoZH 已提交
347 348 349 350
                          1,
                      col_height,
                      "Output_height and padding(padding_up, padding_down) are "
                      "inconsistent.");
C
chengduoZH 已提交
351 352 353
    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
                       (dilation[1] * (filter_width - 1) + 1)) /
                              stride[1] +
C
chengduoZH 已提交
354 355 356 357
                          1,
                      col_width,
                      "col_width and padding(padding_left, padding_right) are "
                      "inconsistent.");
C
chengduoZH 已提交
358

H
hedaoyuan 已提交
359 360 361 362 363 364 365 366 367 368 369
    int block_dim_x = 0;
    int block_dim_y = 0;
    if (filter_height <= 4 && filter_width <= 4) {
      block_dim_x = 4;
      block_dim_y = 4;
    } else if (filter_height <= 8 && filter_width <= 8) {
      block_dim_x = 8;
      block_dim_y = 8;
    } else if (filter_height <= 16 && filter_width <= 16) {
      block_dim_x = 16;
      block_dim_y = 16;
H
hedaoyuan 已提交
370
    } else {
H
hedaoyuan 已提交
371 372
      block_dim_x = 32;
      block_dim_y = 32;
H
hedaoyuan 已提交
373 374
    }

H
hedaoyuan 已提交
375
    int block_dim_z = 1024 / block_dim_x / block_dim_y;
C
chengduoZH 已提交
376 377
    dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels));
    dim3 grid(col_width, col_height);
Q
QI JUN 已提交
378
    col2imOCF<T><<<grid, threads, 0, context.stream()>>>(
C
chengduoZH 已提交
379 380 381
        col.data<T>(), im_channels, im_height, im_width, filter_height,
        filter_width, stride[0], stride[1], padding[0], padding[1], col_height,
        col_width, im->data<T>());
H
hedaoyuan 已提交
382 383 384
  }
};

H
hedaoyuan 已提交
385
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
Q
QI JUN 已提交
386
                             platform::CUDADeviceContext, float>;
H
hedaoyuan 已提交
387
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
Q
QI JUN 已提交
388
                             platform::CUDADeviceContext, double>;
H
hedaoyuan 已提交
389
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
Q
QI JUN 已提交
390
                             platform::CUDADeviceContext, float>;
H
hedaoyuan 已提交
391
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
Q
QI JUN 已提交
392
                             platform::CUDADeviceContext, double>;
H
hedaoyuan 已提交
393

394
}  // namespace math
395
}  // namespace operators
H
hedaoyuan 已提交
396
}  // namespace paddle