Add NeonDepthwiseConvTransposeFunction.

840104c9 · hedaoyuan · 40d47fae · 840104c9 · 840104c9 · 840104c9
3 changed file
--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
@@ -64,9 +64,10 @@ public:
    // padding the input
    float* inputPadding = inputData;
+    int padInputHeight = inputHeight + 2 * paddingH();
+    int padInputWidth = inputWidth + 2 * paddingW();
    if (paddingH() > 0 || paddingW() > 0) {
-      int newSize = batchSize * inputChannels * (inputHeight + 2 * paddingH()) *
+      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
-                    (inputWidth + 2 * paddingW());
      resizeBuffer<Device>(newSize);
      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
      neon::Padding<float>::run(inputData,
@@ -74,12 +75,8 @@ public:
                                batchSize * inputChannels,
                                inputHeight,
                                inputWidth,
-                                paddingH(),
+                                padInputHeight,
-                                paddingW());
+                                padInputWidth);
-      // height and width of padding data
-      inputHeight += 2 * paddingH();
-      inputWidth += 2 * paddingW();
    }
    std::function<void(
@@ -101,14 +98,14 @@ public:
    for (int i = 0; i < batchSize; i++) {
      DepthWiseConv(inputPadding,
                    filterData,
-                    inputHeight,
+                    padInputHeight,
-                    inputWidth,
+                    padInputWidth,
                    outputChannels,
                    outputHeight,
                    outputWidth,
                    filterMultiplier,
                    outputData);
-      inputPadding += inputChannels * inputHeight * inputWidth;
+      inputPadding += inputChannels * padInputHeight * padInputWidth;
      outputData += outputChannels * outputHeight * outputWidth;
    }
  }

--- a/paddle/function/neon/NeonDepthwiseConv.h
+++ b/paddle/function/neon/NeonDepthwiseConv.h
@@ -477,39 +477,40 @@ struct DepthwiseConvKernel<4, 2> {
 template <class T>
 struct Padding {
-  static void run(const T* src,
+  static void run(const T* input,
-                  T* dest,
+                  T* inputPadding,
                  int channels,
                  int inputHeight,
                  int inputWidth,
-                  int paddingHeight,
+                  int padInputHeight,
-                  int paddingWidth) {
+                  int padInputWidth) {
-    const int destWidth = inputWidth + 2 * paddingWidth;
+    const int paddingHeight = (padInputHeight - inputHeight) / 2;
+    const int paddingWidth = (padInputWidth - inputWidth) / 2;
    for (int c = 0; c < channels; c++) {
      if (paddingHeight > 0) {
-        memset(dest, 0, destWidth * paddingHeight * sizeof(T));
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T));
-        dest += destWidth * paddingHeight;
+        inputPadding += padInputWidth * paddingHeight;
      }
      for (int i = 0; i < inputHeight; i++) {
        // padding head
        for (int j = 0; j < paddingWidth; j++) {
-          *dest++ = T(0);
+          *inputPadding++ = T(0);
        }
-        memcpy(dest, src, inputWidth * sizeof(T));
+        memcpy(inputPadding, input, inputWidth * sizeof(T));
-        dest += inputWidth;
+        inputPadding += inputWidth;
-        src += inputWidth;
+        input += inputWidth;
        // padding tail
        for (int j = 0; j < paddingWidth; j++) {
-          *dest++ = T(0);
+          *inputPadding++ = T(0);
        }
      }
      if (paddingHeight > 0) {
-        memset(dest, 0, destWidth * paddingHeight * sizeof(T));
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T));
-        dest += destWidth * paddingHeight;
+        inputPadding += padInputWidth * paddingHeight;
      }
    }
  }
@@ -518,47 +519,48 @@ struct Padding {
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 template <>
 struct Padding<float> {
-  static void run(const float* src,
+  static void run(const float* input,
-                  float* dest,
+                  float* inputPadding,
                  int channels,
                  int inputHeight,
                  int inputWidth,
-                  int paddingHeight,
+                  int padInputHeight,
-                  int paddingWidth) {
+                  int padInputWidth) {
-    const int destWidth = inputWidth + 2 * paddingWidth;
+    const int paddingHeight = (padInputHeight - inputHeight) / 2;
+    const int paddingWidth = (padInputWidth - inputWidth) / 2;
    for (int c = 0; c < channels; c++) {
      if (paddingHeight > 0) {
-        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
-        dest += destWidth * paddingHeight;
+        inputPadding += padInputWidth * paddingHeight;
      }
      for (int i = 0; i < inputHeight; i++) {
        // padding head
        for (int j = 0; j < paddingWidth; j++) {
-          *dest++ = float(0);
+          *inputPadding++ = float(0);
        }
        int step = inputWidth >> 2;
        int remain = inputWidth & 3;
        for (int s = 0; s < step; s++) {
-          float32x4_t s0 = vld1q_f32(src);
+          float32x4_t s0 = vld1q_f32(input);
-          vst1q_f32(dest, s0);
+          vst1q_f32(inputPadding, s0);
-          src += 4;
+          input += 4;
-          dest += 4;
+          inputPadding += 4;
        }
        for (int r = 0; r < remain; r++) {
-          *dest++ = *src++;
+          *inputPadding++ = *input++;
        }
        // padding tail
        for (int j = 0; j < paddingWidth; j++) {
-          *dest++ = float(0);
+          *inputPadding++ = float(0);
        }
      }
      if (paddingHeight > 0) {
-        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
-        dest += destWidth * paddingHeight;
+        inputPadding += padInputWidth * paddingHeight;
      }
    }
  }

--- a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
+++ b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "NeonDepthwiseConv.h"
+#include "paddle/function/ConvOp.h"
+namespace paddle {
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+template <DeviceType Device>
+class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    int batchSize = input[0];
+    int inputChannels = input[1];
+    int inputHeight = input[2];
+    int inputWidth = input[3];
+    int filterHeight = getFilterHeight(filter);
+    int filterWidth = getFilterWidth(filter);
+    int outputChannels = output[1];
+    int outputHeight = output[2];
+    int outputWidth = output[3];
+    int filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+    // only support strideH() == strideW() and filterHeight == filterWidth.
+    CHECK_EQ(strideH(), strideW());
+    CHECK_EQ(paddingH(), paddingW());
+    CHECK_EQ(filterHeight, filterWidth);
+    float* inputData = inputs[0].data<float>();
+    float* filterData = inputs[1].data<float>();
+    float* outputData = outputs[0].data<float>();
+    // padding the input, input -> inputPadding
+    float* inputPadding = inputData;
+    int padInputHeight =
+        (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH();
+    int padInputWidth =
+        (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW();
+    if (padInputHeight > inputHeight || padInputWidth > inputWidth) {
+      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
+      resizeBuffer<Device>(newSize);
+      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
+      neon::Padding<float>::run(inputData,
+                                inputPadding,
+                                batchSize * inputChannels,
+                                inputHeight,
+                                inputWidth,
+                                padInputHeight,
+                                padInputWidth);
+    }
+    std::function<void(
+        const float*, const float*, int, int, int, int, int, int, float*)>
+        DepthWiseConv;
+    if (filterWidth == 3) {
+      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
+    } else if (filterWidth == 4) {
+      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
+    } else {
+      LOG(FATAL) << "Not supported";
+    }
+    for (int i = 0; i < batchSize; i++) {
+      DepthWiseConv(inputPadding,
+                    filterData,
+                    padInputHeight,
+                    padInputWidth,
+                    outputChannels,
+                    outputHeight,
+                    outputWidth,
+                    filterMultiplier,
+                    outputData);
+      inputPadding += inputChannels * padInputHeight * padInputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+#ifndef PADDLE_TYPE_DOUBLE
+REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose,
+                    CPU,
+                    NeonDepthwiseConvTransposeFunction);
+#endif
+#endif
+}  // namespace paddle