From 65d418f060507999d74c7adca0575e8b991e60b4 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 27 Jul 2018 15:27:08 +0800
Subject: [PATCH] complete im2col with padding==1 and speedup filter width==1

---
 paddle/fluid/operators/math/im2col.cc        |   8 +-
 paddle/fluid/operators/math/im2col_cfo_cpu.h | 218 +++++++++----------
 paddle/fluid/operators/math/im2col_test.cc   |  12 +-
 3 files changed, 113 insertions(+), 125 deletions(-)

diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index 478900e1c..1472edbbf 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -40,10 +40,12 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
         dilation[1] == 1) {
       if (padding[0] == 0 && padding[1] == 0) {
         im2col_sh1sw1dh1dw1ph0pw0<T>(im, col);
-      } else {
-        im2col_sh1sw1dh1dw1<T>(im, padding, col);
+        return;
+      } else if (padding[0] == 1 && padding[1] == 1) {
+        im2col_sh1sw1dh1dw1ph1pw1<T>(im, col);
+        return;
       }
-      return;
+      // TODO(TJ): complete padding >=2
     }
     im2col_common<T>(im, dilation, stride, padding, col);
   }
diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h
index af581f321..ac843cdc7 100644
--- a/paddle/fluid/operators/math/im2col_cfo_cpu.h
+++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h
@@ -21,7 +21,7 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-/*
+/**
  * The most common im2col algorithm.
  * Support dilation, stride and padding.
  */
@@ -61,9 +61,9 @@ inline void im2col_common(const framework::Tensor& im,
   }
 }
 
-/*
+/**
  * im2col algorithm with strides == 1, dilations == 1, paddings == 0
- * */
+ */
 template <typename T>
 inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im,
                                       framework::Tensor* col) {
@@ -96,11 +96,13 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im,
   }
 }
 
-// further optimize: padding == 1 need special
+/**
+ * im2col algorithm with strides == 1, dilations == 1, paddings == 1;
+ * filter_width == 1 has a special implementation
+ */
 template <typename T>
-inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im,
-                                const std::vector<int>& padding,
-                                framework::Tensor* col) {
+inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im,
+                                      framework::Tensor* col) {
   int im_channels = im.dims()[0];
   int im_height = im.dims()[1];
   int im_width = im.dims()[2];
@@ -108,119 +110,57 @@ inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im,
   int filter_width = col->dims()[2];
   int output_height = col->dims()[3];
   int output_width = col->dims()[4];
-  constexpr int sh = 1;
-  constexpr int sw = 1;
+
+  constexpr int plh = 1;
+  constexpr int prh = 1;
+  constexpr int plw = 1;
+  constexpr int prw = 1;
 
   const T* im_data = im.data<T>();
   T* col_data = col->data<T>();
-  int col_matrix_width = output_width * output_height;
   int im_size = im_height * im_width;
-
-  int plh = padding[0];
-  int plw = padding[1];
-  int prh = (output_height - 1) * sh + filter_height - im_height - plh;
-  int prw = (output_width - 1) * sw + filter_width - im_width - plw;
-
-  // fill height padding : 0 ~ plh-1, (oh-prh) ~ (oh-1)
-  // TODO(TJ): refine ph*xxx
-  assert(plh == prh);  // because stride_h == 1
+  int col_matrix_width = output_width * output_height;
   int col_block_fh = filter_width * col_matrix_width;  // fw*oh*ow
   int col_block_ic = filter_height * col_block_fh;     // fh*fw*oh*ow
-  for (int ph = 0; ph < plh; ++ph) {
-    int sz = output_width * (plh - ph);
-    size_t copy_sz = sizeof(T) * sz;
-    T* col_start_l = col_data + ph * col_block_fh;
-    T* col_start_r = col_data + (filter_height - ph - 1) * col_block_fh +
-                     col_matrix_width - sz;
+
+  // fill height padding
+  {
+    size_t copy_size = sizeof(T) * output_width;
+    T* col_start_l = col_data;
+    T* col_start_r = col_data + (filter_height - 1) * col_block_fh +
+                     col_matrix_width - output_width;
     for (int ic = 0; ic < im_channels; ++ic) {
+      // TODO(TJ): move * outside
       T* dst_data_l = col_start_l + ic * col_block_ic;
       T* dst_data_r = col_start_r + ic * col_block_ic;
       for (int kw = 0; kw < filter_width; ++kw) {
-        std::memset(dst_data_l, 0, copy_sz);
-        std::memset(dst_data_r, 0, copy_sz);
+        std::memset(dst_data_l, 0, copy_size);
+        std::memset(dst_data_r, 0, copy_size);
         dst_data_l = dst_data_l + col_matrix_width;
         dst_data_r = dst_data_r + col_matrix_width;
       }
     }
   }
-  // fill width padding
-  assert(plw == prw);  // because stride_w == 1
-  if (plw == 1) {
-    auto pad = static_cast<T>(0);  // padding zero
+
+  auto pad = static_cast<T>(0);
+  if (filter_width == 1) {
+    // fill width padding
     for (int ic = 0; ic < im_channels; ++ic) {
-      // TODO(TJ): use add and resue stride
+      // TODO(TJ): move * outside
       T* dst_data_ic = col_data + ic * col_block_ic;
       for (int kh = 0; kh < filter_height; ++kh) {
-        T* dst_data_kh = dst_data_ic + kh * col_block_fh;
-        for (T* dst_data :
-             {dst_data_kh, dst_data_kh +
-                               (filter_width - prw) * col_matrix_width +
-                               output_width - 1}) {
-          // TODO(TJ): from plh, saving repeated assignment
-          for (int oh = 0; oh < output_height; ++oh) {
-            *dst_data = pad;
-            dst_data = dst_data + output_width;
-          }
+        // TODO(TJ): move * outside
+        T* dst_data = dst_data_ic + kh * col_block_fh;
+        for (int oh = 0; oh < output_height; ++oh) {
+          *dst_data = pad;
+          dst_data = dst_data + output_width - 1;
+          *dst_data = pad;
+          ++dst_data;
         }
       }
     }
-  } else {
-    // padding_size > 1
-    for (int ic = 0; ic < im_channels; ++ic) {
-      // TODO(TJ): use add and resue stride
-      T* dst_data_ic = col_data + ic * col_block_ic;
-      for (int kh = 0; kh < filter_height; ++kh) {
-        T* dst_data_kh = dst_data_ic + kh * col_block_fh;
-        for (int kw = 0; kw < plw; ++kw) {
-          // TODO(TJ): reuse array outside this for
-          size_t sz = sizeof(T) * (plw - kw);
-          T* dst_data = dst_data_kh + kw * col_matrix_width;
-          // TODO(TJ): from plh, saving repeated assignment
-          for (int oh = 0; oh < output_height; ++oh) {
-            std::memset(dst_data, 0, sz);
-            dst_data = dst_data + output_width;
-          }
-        }
-        // TODO(TJ): use reverse to save cache
-        for (int kw = 0; kw < prw; ++kw) {
-          // TODO(TJ): reuse array outside this for
-          auto num = (prw - kw);
-          size_t sz = sizeof(T) * num;
-          T* dst_data = dst_data_kh +
-                        (filter_width - 1 - kw) * col_matrix_width +
-                        output_width - num;
-          // TODO(TJ): from plh, saving repeated assignment
-          for (int oh = 0; oh < output_height; ++oh) {
-            std::memset(dst_data, 0, sz);
-            dst_data = dst_data + output_width;
-          }
-        }
-      }
-    }
-  }
-
-  // fill im_data
-  // padding cover two cases:
-  // 1. kw > 2*pw: kw = 3, pw = 1
-  // 0 x x x x ... x x x x 0
-  // 1 1 1 1 1 1
-  // ==>
-  // 0 x ... x x
-  // x x ... x x
-  // x x ... x 0
-  // 2. kw < 2*pw: kw = 3, pw = 2
-  // 0 0 x x x ... x x x 0 0
-  // 1 1 1 1 1 1
-  // ==>
-  // 0 0 x ... x x x
-  // 0 x x ... x x 0
-  // x x x ... x 0 0
-
-  // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
-  // (output_width-1)}
-  // length of copy_size is equal kw.
-  if (plw + prw < filter_width) {
+    // fill core
+    size_t copy_size = sizeof(T) * (output_width - plw - prw);
     for (int oh = 0; oh < output_height; ++oh) {
       const T* im_data_start =
           im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
@@ -230,33 +170,73 @@ inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im,
       for (int kh = 0; kh < filter_height; ++kh) {
         if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
                                        kh > (filter_height - prh - 1))) {
-          dst_data = dst_data + filter_width * col_matrix_width;
-          continue;
-        }
-        // TODO(TJ): reuse plw-kw outside this for
-        // try to unify
-        for (int kw = 0; kw < plw; ++kw) {
-          std::memcpy(dst_data + (plw - kw), src_data,
-                      sizeof(T) * (output_width - (plw - kw)));
-          dst_data = dst_data + col_matrix_width;
-        }
-        for (int kw = plw; kw < filter_width - prw; ++kw) {
-          std::memcpy(dst_data, src_data + (kw - plw),
-                      sizeof(T) * output_width);
-          dst_data = dst_data + col_matrix_width;
-        }
-        int i = 1;
-        for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
-          std::memcpy(dst_data, src_data + (kw - plw),
-                      sizeof(T) * (output_width - i));
           dst_data = dst_data + col_matrix_width;
+          continue;
         }
+        std::memcpy(dst_data + plw, src_data, copy_size);
+        dst_data = dst_data + col_matrix_width;
         src_data = src_data + im_width;
       }
     }
   }
-  } else {
-    LOG(FATAL) << "Not implement yet";
+    return;
+  }
+
+  // filter_width != 1
+  // fill width padding
+  for (int ic = 0; ic < im_channels; ++ic) {
+    // TODO(TJ): move * outside
+    T* dst_data_ic = col_data + ic * col_block_ic;
+    for (int kh = 0; kh < filter_height; ++kh) {
+      // TODO(TJ): move * outside
+      T* dst_data_kh = dst_data_ic + kh * col_block_fh;
+      for (T* dst_data :
+           {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width +
+                             output_width - 1}) {
+        // TODO(TJ): from plh, saving repeated assignment
+        for (int oh = 0; oh < output_height; ++oh) {
+          *dst_data = pad;
+          dst_data = dst_data + output_width;
+        }
+      }
+    }
+  }
+
+  // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
+  // (output_width-1)}
+  // length of copy_size is equal kw.
+  for (int oh = 0; oh < output_height; ++oh) {
+    const T* im_data_start =
+        im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
+    T* dst_data = col_data + oh * output_width;
+    for (int ic = 0; ic < im_channels; ++ic) {
+      const T* src_data = im_data_start + ic * im_size;
+      for (int kh = 0; kh < filter_height; ++kh) {
+        if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
+                                       kh > (filter_height - prh - 1))) {
+          dst_data = dst_data + filter_width * col_matrix_width;
+          continue;
+        }
+        // TODO(TJ): reuse plw-kw outside this for
+        // try to unify
+        for (int kw = 0; kw < plw; ++kw) {
+          std::memcpy(dst_data + (plw - kw), src_data,
+                      sizeof(T) * (output_width - (plw - kw)));
+          dst_data = dst_data + col_matrix_width;
+        }
+        for (int kw = plw; kw < filter_width - prw; ++kw) {
+          std::memcpy(dst_data, src_data + (kw - plw),
+                      sizeof(T) * output_width);
+          dst_data = dst_data + col_matrix_width;
+        }
+        int i = 1;
+        for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
+          std::memcpy(dst_data, src_data + (kw - plw),
+                      sizeof(T) * (output_width - i));
+          dst_data = dst_data + col_matrix_width;
+        }
+        src_data = src_data + im_width;
+      }
+    }
   }
 }
diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc
index 789d8e684..ae2c90b33 100644
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
@@ -227,7 +227,8 @@ void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
   auto t3 = GetCurrentMs();
 
   LOG(INFO) << "before: " << (t3 - t2) / repeat
-            << ",after: " << (t2 - t1) / repeat;
+            << ",after: " << (t2 - t1) / repeat
+            << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%";
 }
 
 TEST(math, im2col_cputest) {
@@ -244,6 +245,10 @@ TEST(math, im2col_cputest) {
     // height != width
     testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p,
                   /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 1, /*fw*/ 3, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 5, /*fh*/ 3, /*fw*/ 1, /*ph*/ p,
+                  /*pw*/ p);
 
     // filter == 1
     testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
@@ -251,13 +256,14 @@ TEST(math, im2col_cputest) {
     testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
                   /*pw*/ p);
   }
+
   // padding_h != padding_w
   testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1,
                 /*pw*/ 2);
 
   // benchmark
-  for (int p : {0, 1, 2}) {
-    for (int k : {3, 5}) {
+  for (int p : {0, 1}) {
+    for (int k : {1, 3, 5}) {
       LOG(INFO) << "padding == " << p << ", filter == " << k;
       benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k,
                   /*ph*/ p, /*pw*/ p);
-- 
GitLab
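
A minimal standalone sketch of the kernel dispatch this patch leaves in im2col.cc, for quick reference. The enum and the pick_im2col_kernel helper below are hypothetical illustration only (they do not exist in the codebase); the sketch assumes 2-D stride/dilation/padding vectors as in the functor above.

#include <vector>

// Hypothetical sketch, not part of the patch: which im2col specialization
// a given (stride, dilation, padding) combination now reaches.
enum class Im2ColKernel { kCommon, kSh1Sw1Dh1Dw1Ph0Pw0, kSh1Sw1Dh1Dw1Ph1Pw1 };

inline Im2ColKernel pick_im2col_kernel(const std::vector<int>& stride,
                                       const std::vector<int>& dilation,
                                       const std::vector<int>& padding) {
  if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
      dilation[1] == 1) {
    if (padding[0] == 0 && padding[1] == 0) {
      // zero-padding fast path (unchanged by this patch)
      return Im2ColKernel::kSh1Sw1Dh1Dw1Ph0Pw0;
    }
    if (padding[0] == 1 && padding[1] == 1) {
      // new padding==1 fast path; it has its own filter_width==1
      // shortcut internally
      return Im2ColKernel::kSh1Sw1Dh1Dw1Ph1Pw1;
    }
    // padding >= 2 falls through; see the TODO left in im2col.cc
  }
  return Im2ColKernel::kCommon;  // generic path: im2col_common
}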