/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <vector>
#include "paddle/fluid/framework/tensor.h"

namespace paddle {
namespace operators {
namespace math {

/**
 * The most common im2col algorithm.
 * Supports dilation, stride and padding.
 */
template <typename T>
inline void im2col_common(const framework::Tensor& im,
                          const std::vector<int>& dilation,
                          const std::vector<int>& stride,
                          const std::vector<int>& padding,
                          framework::Tensor* col,
                          const DataLayout data_layout = DataLayout::kNCHW) {
  int im_channels =
      (data_layout == DataLayout::kNCHW ? im.dims()[0] : im.dims()[2]);
  int im_height =
      (data_layout == DataLayout::kNCHW ? im.dims()[1] : im.dims()[0]);
  int im_width =
      (data_layout == DataLayout::kNCHW ? im.dims()[2] : im.dims()[1]);
  int filter_height = col->dims()[1];
  int filter_width = col->dims()[2];
  int output_height = col->dims()[3];
  int output_width = col->dims()[4];
  int channels_col = im_channels * filter_height * filter_width;

  const T* im_data = im.data<T>();
  T* col_data = col->data<T>();
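  // Each index c enumerates one (input channel, kernel row, kernel column)
  // triple of the col tensor; the loops below map every output position
  // (h, w) back to its source pixel and write zero wherever the receptive
  // field falls into the padding region.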
  for (int c = 0; c < channels_col; ++c) {
    int w_offset = c % filter_width;
    int h_offset = (c / filter_width) % filter_height;
    int c_im = c / (filter_width * filter_height);
    for (int h = 0; h < output_height; ++h) {
      int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
      for (int w = 0; w < output_width; ++w) {
        int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
        int im_idx;
        if (data_layout == DataLayout::kNCHW) {
          im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
        } else {
          im_idx = (im_row_idx * im_width + im_col_idx) * im_channels + c_im;
        }
        int col_idx = (c * output_height + h) * output_width + w;

        col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
                             im_col_idx < 0 || im_col_idx >= im_width)
                                ? static_cast<T>(0)
                                : im_data[im_idx];
      }
    }
  }
}
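
// A minimal usage sketch (illustrative only; it assumes `im` is a CPU tensor
// of shape [C, H, W] in NCHW layout and `col` has already been resized to
// [C, filter_height, filter_width, output_height, output_width]):
//
//   framework::Tensor im, col;
//   // ... allocate and fill `im`, resize and allocate `col` ...
//   im2col_common<float>(im, /*dilation=*/{1, 1}, /*stride=*/{1, 1},
//                        /*padding=*/{0, 0}, &col);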

/**
 * im2col algorithm with strides == 1, dilations == 1, paddings == 0
 */
template <typename T>
inline void im2col_sh1sw1dh1dw1ph0pw0(
    const framework::Tensor& im, framework::Tensor* col,
    const DataLayout data_layout = DataLayout::kNCHW) {
  int im_channels =
      (data_layout == DataLayout::kNCHW ? im.dims()[0] : im.dims()[2]);
  int im_height =
      (data_layout == DataLayout::kNCHW ? im.dims()[1] : im.dims()[0]);
  int im_width =
      (data_layout == DataLayout::kNCHW ? im.dims()[2] : im.dims()[1]);
  int filter_height = col->dims()[1];
  int filter_width = col->dims()[2];
  int output_height = col->dims()[3];
  int output_width = col->dims()[4];

  const T* im_data = im.data<T>();
  T* col_data = col->data<T>();
  int col_matrix_width = output_width * output_height;
  int im_size = im_height * im_width;
  size_t copy_size = sizeof(T) * output_width;
  const T* im_data_oh = im_data;
  T* dst_data_oh = col_data;
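  // With stride 1, dilation 1 and no padding, the output_width values for a
  // fixed (oh, ic, kh, kw) come from a contiguous run of input row (oh + kh)
  // starting at column kw, so NCHW data can be bulk-copied with memcpy;
  // NHWC data is gathered element by element because channels interleave.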
  for (int oh = 0; oh < output_height; ++oh) {
    const T* src_data_ic = im_data_oh;
    T* dst_data = dst_data_oh;
    for (int ic = 0; ic < im_channels; ++ic) {
      const T* src_data = src_data_ic;
      for (int kh = 0; kh < filter_height; ++kh) {
        for (int kw = 0; kw < filter_width; ++kw) {
          if (data_layout == DataLayout::kNCHW) {
            std::memcpy(dst_data, src_data + kw, copy_size);
          } else {
            for (int kow = 0; kow < output_width; ++kow) {
              dst_data[kow] =
                  im_data[((oh + kh) * im_width + kw + kow) * im_channels + ic];
            }
          }
          dst_data = dst_data + col_matrix_width;
        }
        src_data = src_data + im_width;
      }
      src_data_ic = src_data_ic + im_size;
    }
    im_data_oh = im_data_oh + im_width;
    dst_data_oh = dst_data_oh + output_width;
  }
}

/**
 * im2col algorithm with strides == 1, dilations == 1 and paddings == 1;
 * the filter_width == 1 case takes a dedicated fast path.
 */
template <typename T>
inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im,
                                      framework::Tensor* col,
                                      const DataLayout data_layout) {
  int im_channels =
      (data_layout == DataLayout::kNCHW ? im.dims()[0] : im.dims()[2]);
  int im_height =
      (data_layout == DataLayout::kNCHW ? im.dims()[1] : im.dims()[0]);
  int im_width =
      (data_layout == DataLayout::kNCHW ? im.dims()[2] : im.dims()[1]);
  int filter_height = col->dims()[1];
  int filter_width = col->dims()[2];
  int output_height = col->dims()[3];
  int output_width = col->dims()[4];

  constexpr int plh = 1;
  constexpr int prh = 1;
  constexpr int plw = 1;
  constexpr int prw = 1;

  const T* im_data = im.data<T>();
  T* col_data = col->data<T>();
  int im_size = im_height * im_width;
  int col_matrix_width = output_width * output_height;
  int col_block_fh = filter_width * col_matrix_width;  // fw*oh*ow
  int col_block_ic = filter_height * col_block_fh;     // fh*fw*oh*ow
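  // Three passes follow: zero the col entries whose kernel row reads the top
  // or bottom padding row, zero the entries whose kernel column reads the
  // left or right padding column, and finally copy the interior pixels
  // (memcpy for NCHW, element-by-element gathering for NHWC).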

  // fill height padding
  {
    size_t copy_size = sizeof(T) * output_width;
    T* col_start_l = col_data;
    T* col_start_r = col_data + (filter_height - 1) * col_block_fh +
                     col_matrix_width - output_width;
    for (int ic = 0; ic < im_channels; ++ic) {
      T* dst_data_l = col_start_l;
      T* dst_data_r = col_start_r;
      for (int kw = 0; kw < filter_width; ++kw) {
        std::memset(dst_data_l, 0, copy_size);
        std::memset(dst_data_r, 0, copy_size);
        dst_data_l = dst_data_l + col_matrix_width;
        dst_data_r = dst_data_r + col_matrix_width;
      }
      col_start_l = col_start_l + col_block_ic;
      col_start_r = col_start_r + col_block_ic;
    }
  }

  auto pad = static_cast<T>(0);
  if (filter_width == 1) {
    // fill width padding
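    // With a 1-wide filter the only entries that touch the left/right padding
    // columns are the first and last element of every output row, so the
    // loops below write a single zero at each end of each row.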
    T* dst_data_ic = col_data;
    for (int ic = 0; ic < im_channels; ++ic) {
      T* dst_data_kh = dst_data_ic;
      for (int kh = 0; kh < filter_height; ++kh) {
        T* dst_data = dst_data_kh;
        for (int oh = 0; oh < output_height; ++oh) {
          *dst_data = pad;
          dst_data = dst_data + output_width - 1;
          *dst_data = pad;
          ++dst_data;
        }
        dst_data_kh = dst_data_kh + col_block_fh;
      }
      dst_data_ic = dst_data_ic + col_block_ic;
    }
    // fill core
    size_t copy_size = sizeof(T) * (output_width - plw - prw);
    for (int oh = 0; oh < output_height; ++oh) {
      const T* im_data_start =
          im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
      T* dst_data = col_data + oh * output_width;
      for (int ic = 0; ic < im_channels; ++ic) {
        const T* src_data = im_data_start + ic * im_size;
        for (int kh = 0; kh < filter_height; ++kh) {
          if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
                                         kh > (filter_height - prh - 1))) {
            dst_data = dst_data + col_matrix_width;
            continue;
          }
          if (data_layout == DataLayout::kNCHW) {
            std::memcpy(dst_data + plw, src_data, copy_size);
          } else {
            for (int kow = 0; kow < output_width - plw - prw; ++kow) {
              dst_data[plw + kow] =
                  im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
                           kow) *
                              im_channels +
                          ic];
            }
          }
          dst_data = dst_data + col_matrix_width;
          src_data = src_data + im_width;
        }
      }
    }
    return;
  }

  // filter_width != 1
  // fill width padding
  T* dst_data_ic = col_data;
  for (int ic = 0; ic < im_channels; ++ic) {
    T* dst_data_kh = dst_data_ic;
    for (int kh = 0; kh < filter_height; ++kh) {
      for (T* dst_data :
           {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width +
                             output_width - 1}) {
        // TODO(TJ): start from plh to save the repeated assignments
        for (int oh = 0; oh < output_height; ++oh) {
          *dst_data = pad;
          dst_data = dst_data + output_width;
        }
      }
      dst_data_kh = dst_data_kh + col_block_fh;
    }
    dst_data_ic = dst_data_ic + col_block_ic;
  }

  // TODO(TJ): use an array like size_t copy_size[kw] = {sizeof(T) *
  // (output_width - 1)}; the length of copy_size equals kw.
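  // For a general filter_width the kernel columns split into three ranges:
  // kw < plw reads across the left padding (the copy is shifted right and
  // shortened by plw - kw), plw <= kw < filter_width - prw copies full rows,
  // and the last prw columns read into the right padding, so the copy is
  // truncated on the right.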
  for (int oh = 0; oh < output_height; ++oh) {
    const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
    T* dst_data = col_data + oh * output_width;
    for (int ic = 0; ic < im_channels; ++ic) {
      const T* src_data = im_data_start + ic * im_size;
      for (int kh = 0; kh < filter_height; ++kh) {
        if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
                                       kh > (filter_height - prh - 1))) {
          dst_data = dst_data + filter_width * col_matrix_width;
          continue;
        }
        // TODO(TJ): reuse plw-kw outside this for
        // try to unify
        for (int kw = 0; kw < plw; ++kw) {
          if (data_layout == DataLayout::kNCHW) {
            std::memcpy(dst_data + (plw - kw), src_data,
                        sizeof(T) * (output_width - (plw - kw)));
          } else {
            for (int kow = 0; kow < output_width - (plw - kw); ++kow) {
              dst_data[plw - kw + kow] =
                  im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
                           kow) *
                              im_channels +
                          ic];
            }
          }
          dst_data = dst_data + col_matrix_width;
        }
        for (int kw = plw; kw < filter_width - prw; ++kw) {
          if (data_layout == DataLayout::kNCHW) {
            std::memcpy(dst_data, src_data + (kw - plw),
                        sizeof(T) * output_width);
          } else {
            for (int kow = 0; kow < output_width; ++kow) {
              dst_data[kow] =
                  im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
                           kw - plw + kow) *
                              im_channels +
                          ic];
            }
          }
          dst_data = dst_data + col_matrix_width;
        }
        int i = 1;
        for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
          if (data_layout == DataLayout::kNCHW) {
            std::memcpy(dst_data, src_data + (kw - plw),
                        sizeof(T) * (output_width - i));
          } else {
            for (int kow = 0; kow < output_width - i; ++kow) {
              dst_data[kow] =
                  im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
                           kw - plw + kow) *
                              im_channels +
                          ic];
            }
          }
          dst_data = dst_data + col_matrix_width;
        }
        src_data = src_data + im_width;
      }
    }
  }
}

}  // namespace math
}  // namespace operators
}  // namespace paddle