diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index bb55ce21b0599bcff4db138a46c9c700f6e52422..1472edbbf47e3e4d6b22c65349713904b13647d2 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" #include +#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" namespace paddle { namespace operators { @@ -35,61 +36,18 @@ class Im2ColFunctordims().size() == 5); - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - - int channels_col = im_channels * filter_height * filter_width; - - const T* im_data = im.data(); - T* col_data = col->data(); - // TODO(TJ): change me to template - // further optimaze: - // 1. padding != 1 - // 2. could also support stride_h != 1 if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && - dilation[1] == 1 && padding[0] == 0 && padding[1] == 0) { - int col_matrix_width = output_width * output_height; - size_t copy_size = sizeof(T) * output_width; - for (int oh = 0; oh < output_height; ++oh) { - const T* im_data_start = im_data + oh * im_width; - T* dst_data = col_data + oh * output_width; - for (int ic = 0; ic < im_channels; ++ic) { - const T* src_data = im_data_start + ic * im_height * im_width; - for (int kh = 0; kh < filter_height; ++kh) { - for (int kw = 0; kw < filter_width; ++kw) { - std::memcpy(dst_data, src_data + kw, copy_size); - dst_data = dst_data + col_matrix_width; - } - src_data = src_data + im_width; - } - } - } - return; - } - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < output_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < output_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * output_height + h) * output_width + w; - int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? static_cast(0) - : im_data[im_idx]; - } + dilation[1] == 1) { + if (padding[0] == 0 && padding[1] == 0) { + im2col_sh1sw1dh1dw1ph0pw0(im, col); + return; + } else if (padding[0] == 1 && padding[1] == 1) { + im2col_sh1sw1dh1dw1ph1pw1(im, col); + return; } + // TODO(TJ): complete padding >=2 } + im2col_common(im, dilation, stride, padding, col); } }; diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..0d32bc5bd0d7f25479370959cabeb9b9c9e7e2d6 --- /dev/null +++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h @@ -0,0 +1,252 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +/** + * The most common im2col algorithm. + * Support dilation, stride and padding. + */ +template +inline void im2col_common(const framework::Tensor& im, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + int channels_col = im_channels * filter_height * filter_width; + + const T* im_data = im.data(); + T* col_data = col->data(); + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < output_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < output_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * output_height + h) * output_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? static_cast(0) + : im_data[im_idx]; + } + } + } +} + +/** + * im2col algorithm with strides == 1, dilations == 1, paddings == 0 + */ +template +inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + const T* im_data = im.data(); + T* col_data = col->data(); + int col_matrix_width = output_width * output_height; + int im_size = im_height * im_width; + size_t copy_size = sizeof(T) * output_width; + const T* im_data_oh = im_data; + T* dst_data_oh = col_data; + for (int oh = 0; oh < output_height; ++oh) { + const T* src_data_ic = im_data_oh; + T* dst_data = dst_data_oh; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = src_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + for (int kw = 0; kw < filter_width; ++kw) { + std::memcpy(dst_data, src_data + kw, copy_size); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + src_data_ic = src_data_ic + im_size; + } + im_data_oh = im_data_oh + im_width; + dst_data_oh = dst_data_oh + output_width; + } +} + +/** + * im2col algorithm with strides == 1, dilations == 1, paddings == 1 + * and filter_width == 1 have a special implementation + */ +template +inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + constexpr int plh = 1; + constexpr int prh = 1; + constexpr int plw = 1; + constexpr int prw = 1; + + const T* im_data = im.data(); + T* col_data = col->data(); + int im_size = im_height * im_width; + int col_matrix_width = output_width * output_height; + int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow + int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow + + // fill height padding + { + size_t copy_size = sizeof(T) * output_width; + T* col_start_l = col_data; + T* col_start_r = col_data + (filter_height - 1) * col_block_fh + + col_matrix_width - output_width; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_l = col_start_l; + T* dst_data_r = col_start_r; + for (int kw = 0; kw < filter_width; ++kw) { + std::memset(dst_data_l, 0, copy_size); + std::memset(dst_data_r, 0, copy_size); + dst_data_l = dst_data_l + col_matrix_width; + dst_data_r = dst_data_r + col_matrix_width; + } + col_start_l = col_start_l + col_block_ic; + col_start_r = col_start_r + col_block_ic; + } + } + + auto pad = static_cast(0); + if (filter_width == 1) { + // fill width padding + T* dst_data_ic = col_data; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_kh = dst_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + T* dst_data = dst_data_kh; + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width - 1; + *dst_data = pad; + ++dst_data; + } + dst_data_kh = dst_data_kh + col_block_fh; + } + dst_data_ic = dst_data_ic + col_block_ic; + } + // fill core + size_t copy_size = sizeof(T) * (output_width - plw - prw); + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = + im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + col_matrix_width; + continue; + } + std::memcpy(dst_data + plw, src_data, copy_size); + dst_data = dst_data + col_matrix_width; + src_data = src_data + im_width; + } + } + } + return; + } + + // filter_width != 1 + // fill width padding + T* dst_data_ic = col_data; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_kh = dst_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + for (T* dst_data : + {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width + + output_width - 1}) { + // TODO(TJ): from plh, saving repeated assignment + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width; + } + } + dst_data_kh = dst_data_kh + col_block_fh; + } + dst_data_ic = dst_data_ic + col_block_ic; + } + + // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * + // (output_width-1)} + // length of copy_size is equal kw. + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + filter_width * col_matrix_width; + continue; + } + // TODO(TJ): reuse plw-kw outside this for + // try to unify + for (int kw = 0; kw < plw; ++kw) { + std::memcpy(dst_data + (plw - kw), src_data, + sizeof(T) * (output_width - (plw - kw))); + dst_data = dst_data + col_matrix_width; + } + for (int kw = plw; kw < filter_width - prw; ++kw) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * output_width); + dst_data = dst_data + col_matrix_width; + } + int i = 1; + for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * (output_width - i)); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } + } +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index db61f68db3e492d98cfa43576fa1900bffc8674d..ae2c90b33a4298ada4fd01aa2a44ebdf10d036d4 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" #include +#include #include +#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" template void testIm2col() { @@ -160,82 +162,111 @@ void testIm2col() { delete context; } -void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { - paddle::framework::Tensor input; - paddle::framework::Tensor output; - paddle::framework::Tensor ref_output; - std::vector padding({ph, pw}); - std::vector stride({1, 1}); // stride_y, stride_x - std::vector dilation({1, 1}); // dilation_y, dilation_x - int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; - int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; - float* input_ptr = - input.mutable_data({ic, ih, iw}, paddle::platform::CPUPlace()); - for (int i = 0; i < input.numel(); ++i) { - input_ptr[i] = static_cast(i + 1); - } - - paddle::platform::CPUPlace place; - paddle::platform::CPUDeviceContext context(place); - output.mutable_data({ic, fh, fw, output_height, output_width}, place); - ref_output.mutable_data({ic, fh, fw, output_height, output_width}, - place); - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, - paddle::platform::CPUDeviceContext, float> - im2col; - im2col(context, input, dilation, stride, padding, &output); - auto ref_im2col = [&]( - const paddle::framework::Tensor& im, const std::vector& dilation, - const std::vector& stride, const std::vector& padding, - paddle::framework::Tensor* col) { - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - int channels_col = im_channels * filter_height * filter_width; - - const float* im_data = im.data(); - float* col_data = col->data(); - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < output_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < output_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * output_height + h) * output_width + w; - int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? 0.f - : im_data[im_idx]; - } - } - } - }; - - ref_im2col(input, dilation, stride, padding, &ref_output); - - float* out_cfo_ptr = output.data(); - float* out_ref_ptr = ref_output.data(); - for (int i = 0; i < output.numel(); ++i) { - EXPECT_EQ(out_cfo_ptr[i], out_ref_ptr[i]); - } -} - TEST(math, im2col) { testIm2col(); - testIm2colCPU(/*ic*/ 3, /*ih*/ 5, /*iw*/ 5, /*fh*/ 3, /*fw*/ 2, /*ph*/ 0, - /*pw*/ 0); - testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1, - /*pw*/ 1); #ifdef PADDLE_WITH_CUDA testIm2col(); #endif } + +#define PREPARE_IM2COL_CPU \ + paddle::platform::CPUPlace place; \ + paddle::platform::CPUDeviceContext context(place); \ + paddle::framework::Tensor input; \ + paddle::framework::Tensor out; \ + paddle::framework::Tensor ref; \ + std::vector padding({ph, pw}); \ + std::vector stride({1, 1}); \ + std::vector dilation({1, 1}); \ + float* input_ptr = input.mutable_data({ic, ih, iw}, place); \ + for (int i = 0; i < input.numel(); ++i) { \ + input_ptr[i] = static_cast(i + 1); \ + } \ + int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; \ + int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; \ + out.mutable_data({ic, fh, fw, output_height, output_width}, place); \ + ref.mutable_data({ic, fh, fw, output_height, output_width}, place); \ + paddle::operators::math::Im2ColFunctor< \ + paddle::operators::math::ColFormat::kCFO, \ + paddle::platform::CPUDeviceContext, float> \ + im2col + +void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { + PREPARE_IM2COL_CPU; + + im2col(context, input, dilation, stride, padding, &out); + paddle::operators::math::im2col_common(input, dilation, stride, + padding, &ref); + + float* ref_data = ref.data(); + float* out_data = out.data(); + for (int i = 0; i < out.numel(); ++i) { + EXPECT_EQ(out_data[i], ref_data[i]); + } +} + +void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { + PREPARE_IM2COL_CPU; + constexpr int repeat = 100; + auto GetCurrentMs = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; + }; + auto t1 = GetCurrentMs(); + for (int i = 0; i < repeat; ++i) { + im2col(context, input, dilation, stride, padding, &out); + } + auto t2 = GetCurrentMs(); + + for (int i = 0; i < repeat; ++i) { + paddle::operators::math::im2col_common(input, dilation, stride, + padding, &ref); + } + auto t3 = GetCurrentMs(); + + LOG(INFO) << "before: " << (t3 - t2) / repeat + << ",after: " << (t2 - t1) / repeat + << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%"; +} + +TEST(math, im2col_cputest) { + // padding_h == padding_w + for (int p = 0; p < 4; ++p) { + // width == height + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 5, /*fh*/ 4, /*fw*/ 4, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 2, /*ph*/ p, + /*pw*/ p); + + // height != width + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 1, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 5, /*fh*/ 3, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + + // filter == 1 + testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + } + + // padding_h != padding_w + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1, + /*pw*/ 2); + + // benchmark + for (int p : {0, 1}) { + for (int k : {1, 3, 5}) { + LOG(INFO) << "padding == " << p << ", filter == " << k; + benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k, + /*ph*/ p, /*pw*/ p); + } + } +}