diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 9b91e223a6a28586b11fe7ed4a44421e029a67bb..1e0cff436ff60d5a029e89657d00af2b0bf8b454 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -94,95 +94,4 @@ public: int paddingWidth); }; -template -struct Padding { - static void run(const T* src, - T* dest, - int channels, - int inputHeight, - int inputWidth, - int paddingHeight, - int paddingWidth) { - const int destWidth = inputWidth + 2 * paddingWidth; - for (int c = 0; c < channels; c++) { - if (paddingHeight > 0) { - memset(dest, 0, destWidth * paddingHeight * sizeof(T)); - dest += destWidth * paddingHeight; - } - - for (int i = 0; i < inputHeight; i++) { - // padding head - for (int j = 0; j < paddingWidth; j++) { - *dest++ = T(0); - } - - memcpy(dest, src, inputWidth * sizeof(T)); - dest += inputWidth; - src += inputWidth; - - // padding tail - for (int j = 0; j < paddingWidth; j++) { - *dest++ = T(0); - } - } - - if (paddingHeight > 0) { - memset(dest, 0, destWidth * paddingHeight * sizeof(T)); - dest += destWidth * paddingHeight; - } - } - } -}; - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -template <> -struct Padding { - static void run(const float* src, - float* dest, - int channels, - int inputHeight, - int inputWidth, - int paddingHeight, - int paddingWidth) { - const int destWidth = inputWidth + 2 * paddingWidth; - for (int c = 0; c < channels; c++) { - if (paddingHeight > 0) { - memset(dest, 0, destWidth * paddingHeight * sizeof(float)); - dest += destWidth * paddingHeight; - } - - for (int i = 0; i < inputHeight; i++) { - // padding head - for (int j = 0; j < paddingWidth; j++) { - *dest++ = float(0); - } - - int step = inputWidth >> 2; - int remain = inputWidth & 3; - for (int s = 0; s < step; s++) { - float32x4_t s0 = vld1q_f32(src); - vst1q_f32(dest, s0); - src += 4; - dest += 4; - } - for (int r = 0; r < remain; r++) { - *dest++ = *src++; - } - - // padding tail - for (int j = 0; j < paddingWidth; j++) { - *dest++ = float(0); - } - } - - if (paddingHeight > 0) { - memset(dest, 0, destWidth * paddingHeight * sizeof(float)); - dest += destWidth * paddingHeight; - } - } - } -}; - -#endif - } // namespace paddle diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp index 3d502f5d6d1c41e5f5223501f7689f3a02e7296e..bd9a56a8a520fe0caadaf161ecd8295653ece826 100644 --- a/paddle/function/neon/NeonDepthwiseConv.cpp +++ b/paddle/function/neon/NeonDepthwiseConv.cpp @@ -14,7 +14,6 @@ limitations under the License. */ #include "NeonDepthwiseConv.h" #include "paddle/function/ConvOp.h" -#include "paddle/function/Im2Col.h" namespace paddle { @@ -70,13 +69,13 @@ public: (inputWidth + 2 * paddingW()); resizeBuffer(newSize); inputPadding = reinterpret_cast(memory_->getBuf()); - Padding::run(inputData, - inputPadding, - batchSize * inputChannels, - inputHeight, - inputWidth, - paddingH(), - paddingW()); + neon::Padding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + paddingH(), + paddingW()); // height and width of padding data inputHeight += 2 * paddingH(); diff --git a/paddle/function/neon/NeonDepthwiseConv.h b/paddle/function/neon/NeonDepthwiseConv.h index cb1abe1f3261156eb00c3a7ac4019d2c35d43478..3ceaa65ddb9027563a1ec9cea68cf8175fa30769 100644 --- a/paddle/function/neon/NeonDepthwiseConv.h +++ b/paddle/function/neon/NeonDepthwiseConv.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "neon_util.h" namespace paddle { @@ -474,6 +475,97 @@ struct DepthwiseConvKernel<4, 2> { } }; +template +struct Padding { + static void run(const T* src, + T* dest, + int channels, + int inputHeight, + int inputWidth, + int paddingHeight, + int paddingWidth) { + const int destWidth = inputWidth + 2 * paddingWidth; + for (int c = 0; c < channels; c++) { + if (paddingHeight > 0) { + memset(dest, 0, destWidth * paddingHeight * sizeof(T)); + dest += destWidth * paddingHeight; + } + + for (int i = 0; i < inputHeight; i++) { + // padding head + for (int j = 0; j < paddingWidth; j++) { + *dest++ = T(0); + } + + memcpy(dest, src, inputWidth * sizeof(T)); + dest += inputWidth; + src += inputWidth; + + // padding tail + for (int j = 0; j < paddingWidth; j++) { + *dest++ = T(0); + } + } + + if (paddingHeight > 0) { + memset(dest, 0, destWidth * paddingHeight * sizeof(T)); + dest += destWidth * paddingHeight; + } + } + } +}; + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +template <> +struct Padding { + static void run(const float* src, + float* dest, + int channels, + int inputHeight, + int inputWidth, + int paddingHeight, + int paddingWidth) { + const int destWidth = inputWidth + 2 * paddingWidth; + for (int c = 0; c < channels; c++) { + if (paddingHeight > 0) { + memset(dest, 0, destWidth * paddingHeight * sizeof(float)); + dest += destWidth * paddingHeight; + } + + for (int i = 0; i < inputHeight; i++) { + // padding head + for (int j = 0; j < paddingWidth; j++) { + *dest++ = float(0); + } + + int step = inputWidth >> 2; + int remain = inputWidth & 3; + for (int s = 0; s < step; s++) { + float32x4_t s0 = vld1q_f32(src); + vst1q_f32(dest, s0); + src += 4; + dest += 4; + } + for (int r = 0; r < remain; r++) { + *dest++ = *src++; + } + + // padding tail + for (int j = 0; j < paddingWidth; j++) { + *dest++ = float(0); + } + } + + if (paddingHeight > 0) { + memset(dest, 0, destWidth * paddingHeight * sizeof(float)); + dest += destWidth * paddingHeight; + } + } + } +}; + +#endif + #endif } // namespace neon