diff --git a/paddle/function/neon/NeonDepthwiseConv.h b/paddle/function/neon/NeonDepthwiseConv.h index 30f0158c61e4753cb87f45453199053c58b91673..aefeea78badbca3d0d09e292e4e1e148618f8ac6 100644 --- a/paddle/function/neon/NeonDepthwiseConv.h +++ b/paddle/function/neon/NeonDepthwiseConv.h @@ -566,6 +566,63 @@ struct Padding { } }; +// for stride is 2 +struct StridePadding { + static void run(const float* input, + float* inputPadding, + int channels, + int inputHeight, + int inputWidth, + int padInputHeight, + int padInputWidth) { + const int paddingHeight = (padInputHeight - (inputHeight * 2 - 1)) / 2; + const int paddingWidth = (padInputWidth - (inputWidth * 2 - 1)) / 2; + for (int c = 0; c < channels; c++) { + if (paddingHeight > 0) { + memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float)); + inputPadding += padInputWidth * paddingHeight; + } + + for (int i = 0; i < inputHeight; i++) { + // padding head + for (int j = 0; j < paddingWidth; j++) { + *inputPadding++ = float(0); + } + + int step = inputWidth >> 2; + int remain = inputWidth & 3; + float32x4_t s1 = vdupq_n_f32(0.f); + for (int s = 0; s < step; s++) { + float32x4_t s0 = vld1q_f32(input); + float32x4x2_t v = {s0, s1}; + vst2q_f32(inputPadding, v); + input += 4; + inputPadding += 8; + } + for (int r = 0; r < remain; r++) { + *inputPadding++ = *input++; + *inputPadding++ = float(0); + } + inputPadding--; + + // padding tail + for (int j = 0; j < paddingWidth; j++) { + *inputPadding++ = float(0); + } + if (i != inputHeight - 1) { + memset(inputPadding, 0, padInputWidth * sizeof(float)); + inputPadding += padInputWidth; + } + } + + if (paddingHeight > 0) { + memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float)); + inputPadding += padInputWidth * paddingHeight; + } + } + } +}; + #endif #endif diff --git a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp index 03d571ecfe979ed5386cf8f857c5694c57843a12..49ca4bc8a0947ba329bd991e9f7d001623901a67 100644 --- a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp +++ b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp @@ -74,13 +74,25 @@ public: int newSize = batchSize * inputChannels * padInputHeight * padInputWidth; resizeBuffer(newSize); inputPadding = reinterpret_cast(memory_->getBuf()); - neon::Padding::run(inputData, - inputPadding, - batchSize * inputChannels, - inputHeight, - inputWidth, - padInputHeight, - padInputWidth); + if (strideH() == 1) { + neon::Padding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + padInputHeight, + padInputWidth); + } else if (strideH() == 2) { + neon::StridePadding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + padInputHeight, + padInputWidth); + } else { + LOG(FATAL) << "Not supported"; + } } std::function