diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp index bd9a56a8a520fe0caadaf161ecd8295653ece826..18126152ea0b4ebfe4ec5c8084479787814ed173 100644 --- a/paddle/function/neon/NeonDepthwiseConv.cpp +++ b/paddle/function/neon/NeonDepthwiseConv.cpp @@ -64,9 +64,10 @@ public: // padding the input float* inputPadding = inputData; + int padInputHeight = inputHeight + 2 * paddingH(); + int padInputWidth = inputWidth + 2 * paddingW(); if (paddingH() > 0 || paddingW() > 0) { - int newSize = batchSize * inputChannels * (inputHeight + 2 * paddingH()) * - (inputWidth + 2 * paddingW()); + int newSize = batchSize * inputChannels * padInputHeight * padInputWidth; resizeBuffer(newSize); inputPadding = reinterpret_cast(memory_->getBuf()); neon::Padding::run(inputData, @@ -74,12 +75,8 @@ public: batchSize * inputChannels, inputHeight, inputWidth, - paddingH(), - paddingW()); - - // height and width of padding data - inputHeight += 2 * paddingH(); - inputWidth += 2 * paddingW(); + padInputHeight, + padInputWidth); } std::function { template struct Padding { - static void run(const T* src, - T* dest, + static void run(const T* input, + T* inputPadding, int channels, int inputHeight, int inputWidth, - int paddingHeight, - int paddingWidth) { - const int destWidth = inputWidth + 2 * paddingWidth; + int padInputHeight, + int padInputWidth) { + const int paddingHeight = (padInputHeight - inputHeight) / 2; + const int paddingWidth = (padInputWidth - inputWidth) / 2; for (int c = 0; c < channels; c++) { if (paddingHeight > 0) { - memset(dest, 0, destWidth * paddingHeight * sizeof(T)); - dest += destWidth * paddingHeight; + memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T)); + inputPadding += padInputWidth * paddingHeight; } for (int i = 0; i < inputHeight; i++) { // padding head for (int j = 0; j < paddingWidth; j++) { - *dest++ = T(0); + *inputPadding++ = T(0); } - memcpy(dest, src, inputWidth * sizeof(T)); - dest += inputWidth; - src += inputWidth; + memcpy(inputPadding, input, inputWidth * sizeof(T)); + inputPadding += inputWidth; + input += inputWidth; // padding tail for (int j = 0; j < paddingWidth; j++) { - *dest++ = T(0); + *inputPadding++ = T(0); } } if (paddingHeight > 0) { - memset(dest, 0, destWidth * paddingHeight * sizeof(T)); - dest += destWidth * paddingHeight; + memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T)); + inputPadding += padInputWidth * paddingHeight; } } } @@ -518,47 +519,48 @@ struct Padding { #if defined(__ARM_NEON__) || defined(__ARM_NEON) template <> struct Padding { - static void run(const float* src, - float* dest, + static void run(const float* input, + float* inputPadding, int channels, int inputHeight, int inputWidth, - int paddingHeight, - int paddingWidth) { - const int destWidth = inputWidth + 2 * paddingWidth; + int padInputHeight, + int padInputWidth) { + const int paddingHeight = (padInputHeight - inputHeight) / 2; + const int paddingWidth = (padInputWidth - inputWidth) / 2; for (int c = 0; c < channels; c++) { if (paddingHeight > 0) { - memset(dest, 0, destWidth * paddingHeight * sizeof(float)); - dest += destWidth * paddingHeight; + memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float)); + inputPadding += padInputWidth * paddingHeight; } for (int i = 0; i < inputHeight; i++) { // padding head for (int j = 0; j < paddingWidth; j++) { - *dest++ = float(0); + *inputPadding++ = float(0); } int step = inputWidth >> 2; int remain = inputWidth & 3; for (int s = 0; s < step; s++) { - float32x4_t s0 = vld1q_f32(src); - vst1q_f32(dest, s0); - src += 4; - dest += 4; + float32x4_t s0 = vld1q_f32(input); + vst1q_f32(inputPadding, s0); + input += 4; + inputPadding += 4; } for (int r = 0; r < remain; r++) { - *dest++ = *src++; + *inputPadding++ = *input++; } // padding tail for (int j = 0; j < paddingWidth; j++) { - *dest++ = float(0); + *inputPadding++ = float(0); } } if (paddingHeight > 0) { - memset(dest, 0, destWidth * paddingHeight * sizeof(float)); - dest += destWidth * paddingHeight; + memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float)); + inputPadding += padInputWidth * paddingHeight; } } } diff --git a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp new file mode 100644 index 0000000000000000000000000000000000000000..03d571ecfe979ed5386cf8f857c5694c57843a12 --- /dev/null +++ b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp @@ -0,0 +1,124 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "NeonDepthwiseConv.h" +#include "paddle/function/ConvOp.h" + +namespace paddle { + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +template +class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + int batchSize = input[0]; + int inputChannels = input[1]; + int inputHeight = input[2]; + int inputWidth = input[3]; + int filterHeight = getFilterHeight(filter); + int filterWidth = getFilterWidth(filter); + int outputChannels = output[1]; + int outputHeight = output[2]; + int outputWidth = output[3]; + int filterMultiplier = outputChannels / groups_; + CHECK_EQ(inputChannels, groups_); + + // only support strideH() == strideW() and filterHeight == filterWidth. + CHECK_EQ(strideH(), strideW()); + CHECK_EQ(paddingH(), paddingW()); + CHECK_EQ(filterHeight, filterWidth); + + float* inputData = inputs[0].data(); + float* filterData = inputs[1].data(); + float* outputData = outputs[0].data(); + + // padding the input, input -> inputPadding + float* inputPadding = inputData; + int padInputHeight = + (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH(); + int padInputWidth = + (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW(); + + if (padInputHeight > inputHeight || padInputWidth > inputWidth) { + int newSize = batchSize * inputChannels * padInputHeight * padInputWidth; + resizeBuffer(newSize); + inputPadding = reinterpret_cast(memory_->getBuf()); + neon::Padding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + padInputHeight, + padInputWidth); + } + + std::function + DepthWiseConv; + + if (filterWidth == 3) { + DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run; + } else if (filterWidth == 4) { + DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run; + } else { + LOG(FATAL) << "Not supported"; + } + + for (int i = 0; i < batchSize; i++) { + DepthWiseConv(inputPadding, + filterData, + padInputHeight, + padInputWidth, + outputChannels, + outputHeight, + outputWidth, + filterMultiplier, + outputData); + inputPadding += inputChannels * padInputHeight * padInputWidth; + outputData += outputChannels * outputHeight * outputWidth; + } + } +}; + +#ifndef PADDLE_TYPE_DOUBLE + +REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose, + CPU, + NeonDepthwiseConvTransposeFunction); + +#endif + +#endif + +} // namespace paddle