diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index a50b9ace39249f4f899a46e171bbdced033b46bc..bb55ce21b0599bcff4db138a46c9c700f6e52422 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -40,22 +40,47 @@ class Im2ColFunctordims()[1]; int filter_width = col->dims()[2]; - int col_height = col->dims()[3]; - int col_width = col->dims()[4]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; int channels_col = im_channels * filter_height * filter_width; const T* im_data = im.data(); T* col_data = col->data(); + // TODO(TJ): change me to template + // further optimize: + // 1. padding != 1 + // 2. could also support stride_h != 1 + if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && + dilation[1] == 1 && padding[0] == 0 && padding[1] == 0) { + int col_matrix_width = output_width * output_height; + size_t copy_size = sizeof(T) * output_width; + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = im_data + oh * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_height * im_width; + for (int kh = 0; kh < filter_height; ++kh) { + for (int kw = 0; kw < filter_width; ++kw) { + std::memcpy(dst_data, src_data + kw, copy_size); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } + } + return; + } + for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; int c_im = c / (filter_width * filter_height); - for (int h = 0; h < col_height; ++h) { + for (int h = 0; h < output_height; ++h) { int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < col_width; ++w) { + for (int w = 0; w < output_width; ++w) { int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * col_height + h) * 
col_width + w; + int col_idx = (c * output_height + h) * output_width + w; int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 8e3f0f286823c383bb0c44d0e7887040ec9b20a0..db61f68db3e492d98cfa43576fa1900bffc8674d 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -160,8 +160,80 @@ void testIm2col() { delete context; } +void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { + paddle::framework::Tensor input; + paddle::framework::Tensor output; + paddle::framework::Tensor ref_output; + std::vector padding({ph, pw}); + std::vector stride({1, 1}); // stride_y, stride_x + std::vector dilation({1, 1}); // dilation_y, dilation_x + int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; + int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; + float* input_ptr = + input.mutable_data({ic, ih, iw}, paddle::platform::CPUPlace()); + for (int i = 0; i < input.numel(); ++i) { + input_ptr[i] = static_cast(i + 1); + } + + paddle::platform::CPUPlace place; + paddle::platform::CPUDeviceContext context(place); + output.mutable_data({ic, fh, fw, output_height, output_width}, place); + ref_output.mutable_data({ic, fh, fw, output_height, output_width}, + place); + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kCFO, + paddle::platform::CPUDeviceContext, float> + im2col; + im2col(context, input, dilation, stride, padding, &output); + auto ref_im2col = [&]( + const paddle::framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, const std::vector& padding, + paddle::framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = 
col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + int channels_col = im_channels * filter_height * filter_width; + + const float* im_data = im.data(); + float* col_data = col->data(); + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < output_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < output_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * output_height + h) * output_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? 0.f + : im_data[im_idx]; + } + } + } + }; + + ref_im2col(input, dilation, stride, padding, &ref_output); + + float* out_cfo_ptr = output.data(); + float* out_ref_ptr = ref_output.data(); + for (int i = 0; i < output.numel(); ++i) { + EXPECT_EQ(out_cfo_ptr[i], out_ref_ptr[i]); + } +} + TEST(math, im2col) { testIm2col(); + testIm2colCPU(/*ic*/ 3, /*ih*/ 5, /*iw*/ 5, /*fh*/ 3, /*fw*/ 2, /*ph*/ 0, + /*pw*/ 0); + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1, + /*pw*/ 1); #ifdef PADDLE_WITH_CUDA testIm2col();