/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <cstring>
#include <vector>

#include "paddle/phi/core/dense_tensor.h"

namespace phi {
namespace funcs {

/**
 * The most common im2col algorithm.
 * Supports dilation, stride, and padding.
 */
template <typename T>
inline void im2col_common(const phi::DenseTensor& im,
                          const std::vector<int>& dilation,
                          const std::vector<int>& stride,
                          const std::vector<int>& padding,
                          phi::DenseTensor* col,
                          const DataLayout data_layout = DataLayout::kNCHW) {
  int im_channels =
      (data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]);
  int im_height =
      (data_layout != DataLayout::kNHWC ? im.dims()[1] : im.dims()[0]);
  int im_width =
      (data_layout != DataLayout::kNHWC ? im.dims()[2] : im.dims()[1]);
  int filter_height = col->dims()[1];
  int filter_width = col->dims()[2];
  int output_height = col->dims()[3];
  int output_width = col->dims()[4];
  int channels_col = im_channels * filter_height * filter_width;

  const T* im_data = im.data<T>();
  T* col_data = col->data<T>();
  for (int c = 0; c < channels_col; ++c) {
    int w_offset = c % filter_width;
    int h_offset = (c / filter_width) % filter_height;
    int c_im = c / (filter_width * filter_height);
    for (int h = 0; h < output_height; ++h) {
      int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
      for (int w = 0; w < output_width; ++w) {
        int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
        int im_idx;
        if (data_layout != DataLayout::kNHWC) {
          im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
        } else {
          im_idx = (im_row_idx * im_width + im_col_idx) * im_channels + c_im;
        }
        int col_idx = (c * output_height + h) * output_width + w;

        col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
                             im_col_idx < 0 || im_col_idx >= im_width)
                                ? static_cast<T>(0)
                                : im_data[im_idx];
      }
    }
  }
}
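
// A usage sketch (illustrative only, not part of the original header). For an
// NCHW image `im` with dims [C, H, W] and a col tensor already resized by the
// caller to [C, fh, fw, oh, ow], where, by the usual convolution output-size
// formula, oh = (H + 2 * ph - dh * (fh - 1) - 1) / sh + 1 (and ow likewise):
//
//   std::vector<int> dilation{1, 1}, stride{1, 1}, padding{0, 0};
//   im2col_common<float>(im, dilation, stride, padding, &col);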

/**
 * im2col algorithm with strides == 1, dilations == 1, paddings == 0
 */
template <typename T>
inline void im2col_sh1sw1dh1dw1ph0pw0(
    const phi::DenseTensor& im,
    phi::DenseTensor* col,
    const DataLayout data_layout = DataLayout::kNCHW) {
  int im_channels =
      (data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]);
  int im_height =
      (data_layout != DataLayout::kNHWC ? im.dims()[1] : im.dims()[0]);
  int im_width =
      (data_layout != DataLayout::kNHWC ? im.dims()[2] : im.dims()[1]);
  int filter_height = col->dims()[1];
  int filter_width = col->dims()[2];
  int output_height = col->dims()[3];
  int output_width = col->dims()[4];

  const T* im_data = im.data<T>();
  T* col_data = col->data<T>();
  int col_matrix_width = output_width * output_height;
  int im_size = im_height * im_width;
  size_t copy_size = sizeof(T) * output_width;
  const T* im_data_oh = im_data;
  T* dst_data_oh = col_data;
  for (int oh = 0; oh < output_height; ++oh) {
    const T* src_data_ic = im_data_oh;
    T* dst_data = dst_data_oh;
    for (int ic = 0; ic < im_channels; ++ic) {
      const T* src_data = src_data_ic;
      for (int kh = 0; kh < filter_height; ++kh) {
        for (int kw = 0; kw < filter_width; ++kw) {
          if (data_layout != DataLayout::kNHWC) {
            std::memcpy(dst_data, src_data + kw, copy_size);
          } else {
            for (int kow = 0; kow < output_width; ++kow) {
              dst_data[kow] =
                  im_data[((oh + kh) * im_width + kw + kow) * im_channels + ic];
            }
          }
          dst_data = dst_data + col_matrix_width;
        }
        src_data = src_data + im_width;
      }
      src_data_ic = src_data_ic + im_size;
    }
    im_data_oh = im_data_oh + im_width;
    dst_data_oh = dst_data_oh + output_width;
  }
}
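
// Why the memcpy fast path above works (a sketch, not part of the original
// header): with stride == 1, dilation == 1, and zero padding, the
// output_width pixels read for a fixed (ic, kh, kw) are contiguous in the
// NCHW image, i.e. im_data[ic * H * W + (oh + kh) * W + kw] through
// im_data[ic * H * W + (oh + kh) * W + kw + output_width - 1], so the inner
// loop collapses into a single std::memcpy. An equivalence check against the
// generic path could look like:
//
//   im2col_common<float>(im, {1, 1}, {1, 1}, {0, 0}, &ref_col);
//   im2col_sh1sw1dh1dw1ph0pw0<float>(im, &col);
//   // ref_col and col should now hold identical data.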

/**
 * im2col algorithm with strides == 1, dilations == 1, paddings == 1.
 * The filter_width == 1 case has a special implementation.
 */
template <typename T>
inline void im2col_sh1sw1dh1dw1ph1pw1(const phi::DenseTensor& im,
                                      phi::DenseTensor* col,
                                      const DataLayout data_layout) {
  int im_channels =
      (data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]);
  int im_height =
      (data_layout != DataLayout::kNHWC ? im.dims()[1] : im.dims()[0]);
  int im_width =
      (data_layout != DataLayout::kNHWC ? im.dims()[2] : im.dims()[1]);
  int filter_height = col->dims()[1];
  int filter_width = col->dims()[2];
  int output_height = col->dims()[3];
  int output_width = col->dims()[4];

  constexpr int plh = 1;
  constexpr int prh = 1;
  constexpr int plw = 1;
  constexpr int prw = 1;

  const T* im_data = im.data<T>();
  T* col_data = col->data<T>();
  int im_size = im_height * im_width;
  int col_matrix_width = output_width * output_height;
  int col_block_fh = filter_width * col_matrix_width;  // fw*oh*ow
  int col_block_ic = filter_height * col_block_fh;     // fh*fw*oh*ow

  // fill height padding
  {
    size_t copy_size = sizeof(T) * output_width;
    T* col_start_l = col_data;
    T* col_start_r = col_data + (filter_height - 1) * col_block_fh +
                     col_matrix_width - output_width;
    for (int ic = 0; ic < im_channels; ++ic) {
      T* dst_data_l = col_start_l;
      T* dst_data_r = col_start_r;
      for (int kw = 0; kw < filter_width; ++kw) {
        std::memset(dst_data_l, 0, copy_size);
        std::memset(dst_data_r, 0, copy_size);
        dst_data_l = dst_data_l + col_matrix_width;
        dst_data_r = dst_data_r + col_matrix_width;
      }
      col_start_l = col_start_l + col_block_ic;
      col_start_r = col_start_r + col_block_ic;
    }
  }
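
  // (Explanatory note, an interpretation of the indexing above rather than an
  // original comment: with stride 1 and ph == 1, an entire output row reads
  // from a padding row of the image only for kh == 0 at oh == 0 and for
  // kh == filter_height - 1 at oh == output_height - 1. col_start_l and
  // col_start_r address exactly those two rows, which the block above
  // zero-fills for every (ic, kw).)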

  auto pad = static_cast<T>(0);
  if (filter_width == 1) {
    // fill width padding
    T* dst_data_ic = col_data;
    for (int ic = 0; ic < im_channels; ++ic) {
      T* dst_data_kh = dst_data_ic;
      for (int kh = 0; kh < filter_height; ++kh) {
        T* dst_data = dst_data_kh;
        for (int oh = 0; oh < output_height; ++oh) {
          *dst_data = pad;
          dst_data = dst_data + output_width - 1;
          *dst_data = pad;
          ++dst_data;
        }
        dst_data_kh = dst_data_kh + col_block_fh;
      }
      dst_data_ic = dst_data_ic + col_block_ic;
    }
    // fill core
    size_t copy_size = sizeof(T) * (output_width - plw - prw);
    for (int oh = 0; oh < output_height; ++oh) {
      const T* im_data_start =
          im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
      T* dst_data = col_data + oh * output_width;
      for (int ic = 0; ic < im_channels; ++ic) {
        const T* src_data = im_data_start + ic * im_size;
        for (int kh = 0; kh < filter_height; ++kh) {
          if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
                                         kh > (filter_height - prh - 1))) {
            dst_data = dst_data + col_matrix_width;
            continue;
          }
          if (data_layout != DataLayout::kNHWC) {
            std::memcpy(dst_data + plw, src_data, copy_size);
          } else {
            for (int kow = 0; kow < output_width - plw - prw; ++kow) {
              dst_data[plw + kow] =
                  im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
                           kow) *
                              im_channels +
                          ic];
            }
          }
          dst_data = dst_data + col_matrix_width;
          src_data = src_data + im_width;
        }
      }
    }
    return;
  }

  // filter_width != 1
  // fill width padding
  T* dst_data_ic = col_data;
  for (int ic = 0; ic < im_channels; ++ic) {
    T* dst_data_kh = dst_data_ic;
    for (int kh = 0; kh < filter_height; ++kh) {
      for (T* dst_data :
           {dst_data_kh,
            dst_data_kh + (filter_width - prw) * col_matrix_width +
                output_width - 1}) {
        // TODO(TJ): from plh, saving repeated assignment
        for (int oh = 0; oh < output_height; ++oh) {
          *dst_data = pad;
          dst_data = dst_data + output_width;
        }
      }
      dst_data_kh = dst_data_kh + col_block_fh;
    }
    dst_data_ic = dst_data_ic + col_block_ic;
  }

  // TODO(TJ): use an array like size_t copy_size[kw] = {sizeof(T) *
  // (output_width - 1)}; the length of copy_size equals kw.
  for (int oh = 0; oh < output_height; ++oh) {
    const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
    T* dst_data = col_data + oh * output_width;
    for (int ic = 0; ic < im_channels; ++ic) {
      const T* src_data = im_data_start + ic * im_size;
      for (int kh = 0; kh < filter_height; ++kh) {
        if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
                                       kh > (filter_height - prh - 1))) {
          dst_data = dst_data + filter_width * col_matrix_width;
          continue;
        }
        // TODO(TJ): reuse plw-kw outside this for
        // try to unify
        for (int kw = 0; kw < plw; ++kw) {
          if (data_layout != DataLayout::kNHWC) {
            std::memcpy(dst_data + (plw - kw),
                        src_data,
                        sizeof(T) * (output_width - (plw - kw)));
          } else {
            for (int kow = 0; kow < output_width - (plw - kw); ++kow) {
              dst_data[plw - kw + kow] =
                  im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
                           kow) *
                              im_channels +
                          ic];
            }
          }
          dst_data = dst_data + col_matrix_width;
        }
        for (int kw = plw; kw < filter_width - prw; ++kw) {
          if (data_layout != DataLayout::kNHWC) {
            std::memcpy(
                dst_data, src_data + (kw - plw), sizeof(T) * output_width);
          } else {
            for (int kow = 0; kow < output_width; ++kow) {
              dst_data[kow] =
                  im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
                           kw - plw + kow) *
                              im_channels +
                          ic];
            }
          }
          dst_data = dst_data + col_matrix_width;
        }
        int i = 1;
        for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
          if (data_layout != DataLayout::kNHWC) {
            std::memcpy(dst_data,
                        src_data + (kw - plw),
                        sizeof(T) * (output_width - i));
          } else {
            for (int kow = 0; kow < output_width - i; ++kow) {
              dst_data[kow] =
                  im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
                           kw - plw + kow) *
                              im_channels +
                          ic];
            }
          }
          dst_data = dst_data + col_matrix_width;
        }
        src_data = src_data + im_width;
      }
    }
  }
}
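
// A dispatch sketch (an illustration, not part of the original header; the
// name im2col_cpu_dispatch is hypothetical, and the real selection logic
// lives in this header's callers): the specialized paths above apply only
// when stride == 1 and dilation == 1 with padding uniformly 0 or uniformly 1;
// everything else falls back to the generic routine.
template <typename T>
inline void im2col_cpu_dispatch(
    const phi::DenseTensor& im,
    const std::vector<int>& dilation,
    const std::vector<int>& stride,
    const std::vector<int>& padding,
    phi::DenseTensor* col,
    const DataLayout data_layout = DataLayout::kNCHW) {
  bool unit_step = stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
                   dilation[1] == 1;
  if (unit_step && padding[0] == 0 && padding[1] == 0) {
    im2col_sh1sw1dh1dw1ph0pw0<T>(im, col, data_layout);
  } else if (unit_step && padding[0] == 1 && padding[1] == 1) {
    im2col_sh1sw1dh1dw1ph1pw1<T>(im, col, data_layout);
  } else {
    im2col_common<T>(im, dilation, stride, padding, col, data_layout);
  }
}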

}  // namespace funcs
}  // namespace phi