diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index c572a9d433bc16e6733b8fc9367970bef28e699a..f43f15e5cacb70b625d7791e1e02ce7780286200 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -21,6 +21,8 @@ if(USE_NNPACK) endif() endif() +list(APPEND cpp_files neon/NeonDepthwiseConv.cpp) + add_library(paddle_function STATIC ${cpp_files} ${cu_objs}) add_dependencies(paddle_function ${external_project_dependencies}) add_dependencies(paddle_function paddle_proto) @@ -42,11 +44,11 @@ if(WITH_GPU) add_simple_unittest(RowConvOpTest) add_simple_unittest(BlockExpandOpTest) add_simple_unittest(CropOpTest) - add_simple_unittest(DepthwiseConvOpTest) endif() add_simple_unittest(Im2ColTest) add_simple_unittest(GemmConvOpTest) +add_simple_unittest(DepthwiseConvOpTest) endif() add_style_check_target(paddle_function ${h_files}) diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/function/DepthwiseConvOpTest.cpp index f44ae0c342e9536366e2b537694cee81fcb1a6ed..d8e8c889d5c23bf9b2b5fd0b0393395883188fd8 100644 --- a/paddle/function/DepthwiseConvOpTest.cpp +++ b/paddle/function/DepthwiseConvOpTest.cpp @@ -34,4 +34,13 @@ TEST(DepthwiseConv, BackwardFilter) { } #endif +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +TEST(DepthwiseConv, Forward) { + DepthwiseConvolution( + "GemmConv-CPU", "NeonDepthwiseConv-CPU", forward); +} + +#endif + } // namespace paddle diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 48e2e32f9256fb49c67ba25e9b5a47d72499758b..9b91e223a6a28586b11fe7ed4a44421e029a67bb 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "TensorShape.h" #include "TensorType.h" +#include "neon/neon_util.h" namespace paddle { @@ -93,4 +94,95 @@ public: int paddingWidth); }; +template +struct Padding { + static void run(const T* src, + T* dest, + int channels, + int inputHeight, + int inputWidth, + int paddingHeight, + int paddingWidth) { + const int destWidth = inputWidth + 2 * paddingWidth; + for (int c = 0; c < channels; c++) { + if (paddingHeight > 0) { + memset(dest, 0, destWidth * paddingHeight * sizeof(T)); + dest += destWidth * paddingHeight; + } + + for (int i = 0; i < inputHeight; i++) { + // padding head + for (int j = 0; j < paddingWidth; j++) { + *dest++ = T(0); + } + + memcpy(dest, src, inputWidth * sizeof(T)); + dest += inputWidth; + src += inputWidth; + + // padding tail + for (int j = 0; j < paddingWidth; j++) { + *dest++ = T(0); + } + } + + if (paddingHeight > 0) { + memset(dest, 0, destWidth * paddingHeight * sizeof(T)); + dest += destWidth * paddingHeight; + } + } + } +}; + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +template <> +struct Padding { + static void run(const float* src, + float* dest, + int channels, + int inputHeight, + int inputWidth, + int paddingHeight, + int paddingWidth) { + const int destWidth = inputWidth + 2 * paddingWidth; + for (int c = 0; c < channels; c++) { + if (paddingHeight > 0) { + memset(dest, 0, destWidth * paddingHeight * sizeof(float)); + dest += destWidth * paddingHeight; + } + + for (int i = 0; i < inputHeight; i++) { + // padding head + for (int j = 0; j < paddingWidth; j++) { + *dest++ = float(0); + } + + int step = inputWidth >> 2; + int remain = inputWidth & 3; + for (int s = 0; s < step; s++) { + float32x4_t s0 = vld1q_f32(src); + vst1q_f32(dest, s0); + src += 4; + dest += 4; + } + for (int r = 0; r < remain; r++) { + *dest++ = *src++; + } + + // padding tail + for (int j = 0; j < paddingWidth; j++) { + *dest++ = float(0); + } + } + + if (paddingHeight > 0) { + memset(dest, 0, destWidth * paddingHeight * sizeof(float)); + dest += destWidth * paddingHeight; + } + } + } +}; + +#endif + } // namespace paddle diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f09e98587d1681d29a79a9cb0303c2d4356c6935 --- /dev/null +++ b/paddle/function/neon/NeonDepthwiseConv.cpp @@ -0,0 +1,577 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "neon_util.h" +#include "paddle/function/ConvOp.h" +#include "paddle/function/Im2Col.h" + +namespace paddle { + +namespace neon { + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +template +struct DepthwiseConvKernel {}; + +inline float32_t conv3x3(float32x4_t r0, + float32x4_t r1, + float32x4_t r2, + float32x4_t k0, + float32x4_t k1, + float32x4_t k2) { + float32x4_t tmp; + tmp = vmulq_f32(r0, k0); + tmp = vmlaq_f32(tmp, r1, k1); + tmp = vmlaq_f32(tmp, r2, k2); + return vaddvq_f32(tmp); +} + +inline float32_t conv4x4(float32x4_t r0, + float32x4_t r1, + float32x4_t r2, + float32x4_t r3, + float32x4_t k0, + float32x4_t k1, + float32x4_t k2, + float32x4_t k3) { + float32x4_t tmp; + tmp = vmulq_f32(r0, k0); + tmp = vmlaq_f32(tmp, r1, k1); + tmp = vmlaq_f32(tmp, r2, k2); + tmp = vmlaq_f32(tmp, r3, k3); + return vaddvq_f32(tmp); +} + +/** + * Each step calculates four elements of the output. + * First step: + * R0[0, 1, 2, 3...] * K[0][0] + * R0[1, 2, 3, 4...] * K[0][1] + * R0[2, 3, 4, 5...] * K[0][2] + * R1[0, 1, 2, 3...] * K[1][0] + * R1[1, 2, 3, 4...] * K[1][1] + * R1[2, 3, 4, 5...] * K[1][2] + * R2[0, 1, 2, 3...] * K[2][0] + * R2[1, 2, 3, 4...] * K[2][1] + * + R2[2, 3, 4, 5...] * K[2][2] + * ------------------------------ + * Output[0, 1, 2, 3] + */ +template <> +struct DepthwiseConvKernel<3, 1> { + static void run(const float* inputData, + const float* filterData, + int inputHeight, + int inputWidth, + int outputChannels, + int outputHeight, + int outputWidth, + int filterMultiplier, + float* outputData) { + const int steps = outputWidth >> 2; + const int remain = outputWidth & 3; + for (int c = 0; c < outputChannels; c++, filterData += 9) { + // Load the filters + float32x4_t k[3]; + k[0] = vld1q_f32(filterData); + k[1] = vld1q_f32(filterData + 3); + k[2] = vld1q_f32(filterData + 6); + k[0] = vsetq_lane_f32(0.f, k[0], 3); + k[1] = vsetq_lane_f32(0.f, k[1], 3); + k[2] = vsetq_lane_f32(0.f, k[2], 3); + + const float* r0 = + inputData + (c / filterMultiplier) * (inputHeight * inputWidth); + const float* r1 = r0 + inputWidth; + const float* r2 = r0 + inputWidth * 2; + float32x4_t input[3][3]; + for (int h = 0; h < outputHeight; h++) { + for (int s = 0; s < steps; s++) { + // Load the inputs + float32x4_t tmp; + input[0][0] = vld1q_f32(r0); + tmp = vld1q_f32(r0 + 4); + input[0][1] = vextq_f32(input[0][0], tmp, 1); + input[0][2] = vextq_f32(input[0][0], tmp, 2); + input[1][0] = vld1q_f32(r1); + tmp = vld1q_f32(r1 + 4); + input[1][1] = vextq_f32(input[1][0], tmp, 1); + input[1][2] = vextq_f32(input[1][0], tmp, 2); + input[2][0] = vld1q_f32(r2); + tmp = vld1q_f32(r2 + 4); + input[2][1] = vextq_f32(input[2][0], tmp, 1); + input[2][2] = vextq_f32(input[2][0], tmp, 2); + + float32x4_t tmp1 = vdupq_n_f32(0.f); + float32x4_t tmp2 = vdupq_n_f32(0.f); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp1 = vaddq_f32(tmp1, tmp2); + + vst1q_f32(outputData, tmp1); + r0 += 4; + r1 += 4; + r2 += 4; + outputData += 4; + } + + for (int r = 0; r < remain; r++) { + float32x4_t i0 = vld1q_f32(r0); + float32x4_t i1 = vld1q_f32(r1); + float32x4_t i2 = vld1q_f32(r2); + *outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]); + r0++; + r1++; + r2++; + outputData++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + } + } +}; + +/** + * Each step calculates four elements of the output. + * First step: + * R0[0, 2, 4, 6...] * K[0][0] + * R0[1, 3, 5, 7...] * K[0][1] + * R0[2, 4, 6, 8...] * K[0][2] + * R1[0, 2, 4, 6...] * K[1][0] + * R1[1, 3, 5, 7...] * K[1][1] + * R1[2, 4, 6, 8...] * K[1][2] + * R2[0, 2, 4, 6...] * K[2][0] + * R2[1, 3, 5, 7...] * K[2][1] + * R2[2, 4, 6, 8...] * K[2][2] + * ------------------------------ + * Output[0, 1, 2, 3] + */ +template <> +struct DepthwiseConvKernel<3, 2> { + static void run(const float* inputData, + const float* filterData, + int inputHeight, + int inputWidth, + int outputChannels, + int outputHeight, + int outputWidth, + int filterMultiplier, + float* outputData) { + const int steps = outputWidth >> 2; + const int remain = outputWidth & 3; + for (int c = 0; c < outputChannels; c++, filterData += 9) { + // Load the filters + float32x4_t k[3]; + k[0] = vld1q_f32(filterData); + k[1] = vld1q_f32(filterData + 3); + k[2] = vld1q_f32(filterData + 6); + k[0] = vsetq_lane_f32(0.f, k[0], 3); + k[1] = vsetq_lane_f32(0.f, k[1], 3); + k[2] = vsetq_lane_f32(0.f, k[2], 3); + + const float* start = + inputData + (c / filterMultiplier) * (inputHeight * inputWidth); + float32x4_t input[3][3]; + for (int h = 0; h < outputHeight; h++) { + const float* r0 = start + 2 * h * inputWidth; + const float* r1 = start + (2 * h + 1) * inputWidth; + const float* r2 = start + (2 * h + 2) * inputWidth; + for (int s = 0; s < steps; s++) { + // Load the inputs + float32x4_t data1; + float32x4x2_t data2; + + data2 = vld2q_f32(r0); + input[0][0] = data2.val[0]; + input[0][1] = data2.val[1]; + data1 = vld1q_f32(r0 + 8); + input[0][2] = vextq_f32(data2.val[0], data1, 1); + + data2 = vld2q_f32(r1); + input[1][0] = data2.val[0]; + input[1][1] = data2.val[1]; + data1 = vld1q_f32(r1 + 8); + input[1][2] = vextq_f32(data2.val[0], data1, 1); + + data2 = vld2q_f32(r2); + input[2][0] = data2.val[0]; + input[2][1] = data2.val[1]; + data1 = vld1q_f32(r2 + 8); + input[2][2] = vextq_f32(data2.val[0], data1, 1); + + float32x4_t tmp1 = vdupq_n_f32(0.f); + float32x4_t tmp2 = vdupq_n_f32(0.f); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp1 = vaddq_f32(tmp1, tmp2); + + vst1q_f32(outputData, tmp1); + r0 += 8; + r1 += 8; + r2 += 8; + outputData += 4; + } + + for (int r = 0; r < remain; r++) { + float32x4_t i0 = vld1q_f32(r0); + float32x4_t i1 = vld1q_f32(r1); + float32x4_t i2 = vld1q_f32(r2); + *outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]); + r0 += 2; + r1 += 2; + r2 += 2; + outputData++; + } + } + } + } +}; + +/** + * Each step calculates four elements of the output. + */ +template <> +struct DepthwiseConvKernel<4, 1> { + static void run(const float* inputData, + const float* filterData, + int inputHeight, + int inputWidth, + int outputChannels, + int outputHeight, + int outputWidth, + int filterMultiplier, + float* outputData) { + const int steps = outputWidth >> 2; + const int remain = outputWidth & 3; + for (int c = 0; c < outputChannels; c++, filterData += 16) { + // Load the filters + float32x4_t k[4]; + k[0] = vld1q_f32(filterData); + k[1] = vld1q_f32(filterData + 4); + k[2] = vld1q_f32(filterData + 8); + k[3] = vld1q_f32(filterData + 12); + + const float* r0 = + inputData + (c / filterMultiplier) * (inputHeight * inputWidth); + const float* r1 = r0 + inputWidth; + const float* r2 = r0 + inputWidth * 2; + const float* r3 = r0 + inputWidth * 3; + float32x4_t input[4][4]; + for (int h = 0; h < outputHeight; h++) { + for (int s = 0; s < steps; s++) { + // Load the inputs + float32x4_t tmp; + input[0][0] = vld1q_f32(r0); + tmp = vld1q_f32(r0 + 4); + input[0][1] = vextq_f32(input[0][0], tmp, 1); + input[0][2] = vextq_f32(input[0][0], tmp, 2); + input[0][3] = vextq_f32(input[0][0], tmp, 3); + + input[1][0] = vld1q_f32(r1); + tmp = vld1q_f32(r1 + 4); + input[1][1] = vextq_f32(input[1][0], tmp, 1); + input[1][2] = vextq_f32(input[1][0], tmp, 2); + input[1][3] = vextq_f32(input[1][0], tmp, 3); + + input[2][0] = vld1q_f32(r2); + tmp = vld1q_f32(r2 + 4); + input[2][1] = vextq_f32(input[2][0], tmp, 1); + input[2][2] = vextq_f32(input[2][0], tmp, 2); + input[2][3] = vextq_f32(input[2][0], tmp, 3); + + input[3][0] = vld1q_f32(r3); + tmp = vld1q_f32(r3 + 4); + input[3][1] = vextq_f32(input[3][0], tmp, 1); + input[3][2] = vextq_f32(input[3][0], tmp, 2); + input[3][3] = vextq_f32(input[3][0], tmp, 3); + + float32x4_t tmp1 = vdupq_n_f32(0.f); + float32x4_t tmp2 = vdupq_n_f32(0.f); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); + tmp1 = vaddq_f32(tmp1, tmp2); + + vst1q_f32(outputData, tmp1); + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + outputData += 4; + } + + for (int r = 0; r < remain; r++) { + float32x4_t i0 = vld1q_f32(r0); + float32x4_t i1 = vld1q_f32(r1); + float32x4_t i2 = vld1q_f32(r2); + float32x4_t i3 = vld1q_f32(r3); + *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]); + r0++; + r1++; + r2++; + r3++; + outputData++; + } + + r0 += 3; + r1 += 3; + r2 += 3; + r3 += 3; + } + } + } +}; + +/** + * Each step calculates four elements of the output. + */ +template <> +struct DepthwiseConvKernel<4, 2> { + static void run(const float* inputData, + const float* filterData, + int inputHeight, + int inputWidth, + int outputChannels, + int outputHeight, + int outputWidth, + int filterMultiplier, + float* outputData) { + const int steps = outputWidth >> 2; + const int remain = outputWidth & 3; + for (int c = 0; c < outputChannels; c++, filterData += 16) { + // Load the filters + float32x4_t k[4]; + k[0] = vld1q_f32(filterData); + k[1] = vld1q_f32(filterData + 4); + k[2] = vld1q_f32(filterData + 8); + k[3] = vld1q_f32(filterData + 12); + + const float* start = + inputData + (c / filterMultiplier) * (inputHeight * inputWidth); + float32x4_t input[4][4]; + for (int h = 0; h < outputHeight; h++) { + const float* r0 = start + 2 * h * inputWidth; + const float* r1 = start + (2 * h + 1) * inputWidth; + const float* r2 = start + (2 * h + 2) * inputWidth; + const float* r3 = start + (2 * h + 3) * inputWidth; + for (int s = 0; s < steps; s++) { + // Load the inputs + float32x4x2_t data1; + float32x4x2_t data2; + + data1 = vld2q_f32(r0); + data2 = vld2q_f32(r0 + 8); + input[0][0] = data1.val[0]; + input[0][1] = data1.val[1]; + input[0][2] = vextq_f32(data1.val[0], data2.val[0], 1); + input[0][3] = vextq_f32(data1.val[1], data2.val[1], 1); + + data1 = vld2q_f32(r1); + data2 = vld2q_f32(r1 + 8); + input[1][0] = data1.val[0]; + input[1][1] = data1.val[1]; + input[1][2] = vextq_f32(data1.val[0], data2.val[0], 1); + input[1][3] = vextq_f32(data1.val[1], data2.val[1], 1); + + data1 = vld2q_f32(r2); + data2 = vld2q_f32(r2 + 8); + input[2][0] = data1.val[0]; + input[2][1] = data1.val[1]; + input[2][2] = vextq_f32(data1.val[0], data2.val[0], 1); + input[2][3] = vextq_f32(data1.val[1], data2.val[1], 1); + + data1 = vld2q_f32(r3); + data2 = vld2q_f32(r3 + 8); + input[3][0] = data1.val[0]; + input[3][1] = data1.val[1]; + input[3][2] = vextq_f32(data1.val[0], data2.val[0], 1); + input[3][3] = vextq_f32(data1.val[1], data2.val[1], 1); + + float32x4_t tmp1 = vdupq_n_f32(0.f); + float32x4_t tmp2 = vdupq_n_f32(0.f); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); + tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); + tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); + tmp1 = vaddq_f32(tmp1, tmp2); + + vst1q_f32(outputData, tmp1); + r0 += 8; + r1 += 8; + r2 += 8; + r3 += 8; + outputData += 4; + } + + for (int r = 0; r < remain; r++) { + float32x4_t i0 = vld1q_f32(r0); + float32x4_t i1 = vld1q_f32(r1); + float32x4_t i2 = vld1q_f32(r2); + float32x4_t i3 = vld1q_f32(r3); + *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]); + r0 += 2; + r1 += 2; + r2 += 2; + r3 += 2; + outputData++; + } + } + } + } +}; + +template +class NeonDepthwiseConvFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + size_t filterMultiplier = outputChannels / groups_; + CHECK_EQ(inputChannels, groups_); + + // only support strideH() == strideW() and filterHeight == filterWidth. + CHECK_EQ(strideH(), strideW()); + CHECK_EQ(filterHeight, filterWidth); + + float* inputData = inputs[0].data(); + float* filterData = inputs[1].data(); + float* outputData = outputs[0].data(); + + // padding the input + float* inputPadding = inputData; + if (paddingH() > 0 || paddingW() > 0) { + int newSize = batchSize * inputChannels * (inputHeight + 2 * paddingH()) * + (inputWidth + 2 * paddingW()); + resizeBuffer(newSize); + inputPadding = reinterpret_cast(memory_->getBuf()); + Padding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + paddingH(), + paddingW()); + + // height and width of padding data + inputHeight += 2 * paddingH(); + inputWidth += 2 * paddingW(); + } + + std::function + DepthWiseConv; + + if (filterWidth == 3 && strideW() == 1) { + DepthWiseConv = DepthwiseConvKernel<3, 1>::run; + } else if (filterWidth == 3 && strideW() == 2) { + DepthWiseConv = DepthwiseConvKernel<3, 2>::run; + } else if (filterWidth == 4 && strideW() == 1) { + DepthWiseConv = DepthwiseConvKernel<4, 1>::run; + } else if (filterWidth == 4 && strideW() == 2) { + DepthWiseConv = DepthwiseConvKernel<4, 2>::run; + } else { + LOG(FATAL) << "Not supported"; + } + + for (size_t i = 0; i < batchSize; i++) { + DepthWiseConv(inputPadding, + filterData, + inputHeight, + inputWidth, + outputChannels, + outputHeight, + outputWidth, + filterMultiplier, + outputData); + inputPadding += inputChannels * inputHeight * inputWidth; + outputData += outputChannels * outputHeight * outputWidth; + } + } +}; + +REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction); + +#endif + +} // namespace neon +} // namespace paddle diff --git a/paddle/function/neon/neon_util.h b/paddle/function/neon/neon_util.h new file mode 100644 index 0000000000000000000000000000000000000000..56b3febe2d27bb4fbf57e49079b3ad071d556914 --- /dev/null +++ b/paddle/function/neon/neon_util.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +#include + +namespace paddle { + +namespace neon { + +inline float32x4_t vld1q_f32_aligned(const float* p) { + return vld1q_f32( + (const float*)__builtin_assume_aligned(p, sizeof(float32x4_t))); +} + +#ifndef __aarch64__ +inline float32_t vaddvq_f32(float32x4_t a) { + float32x2_t v = vadd_f32(vget_high_f32(a), vget_low_f32(a)); + return vget_lane_f32(vpadd_f32(v, v), 0); +} + +inline float32x4_t vmlaq_laneq_f32(float32x4_t a, + float32x4_t b, + float32x4_t v, + const int lane) { + return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane)); +} +#endif + +} // namespace neon +} // namespace paddle + +#endif diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp index 0ece2799318ea5ecc91f97f71289d4d07246dcaa..20de475fc3f6b6f3c05ac26bea8363daff0cf110 100644 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ b/paddle/gserver/layers/ExpandConvLayer.cpp @@ -29,6 +29,10 @@ namespace paddle { REGISTER_LAYER(exconv, ExpandConvLayer); REGISTER_LAYER(exconvt, ExpandConvLayer); +inline bool isDepthwiseConv(int channels, int groups) { + return channels == groups; +} + bool ExpandConvLayer::init(const LayerMap &layerMap, const ParameterMap ¶meterMap) { /* Initialize the basic convolutional parent class */ @@ -47,14 +51,27 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, std::vector paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; std::vector strides = {(size_t)strideY_[i], (size_t)stride_[i]}; - if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) { + // Convolution Layer uses the GemmConv function by default. + convType = "GemmConv"; + convGradInputType = "GemmConvGradInput"; + convGradFilterType = "GemmConvGradFilter"; + + // If depth wise convolution and useGpu == true + if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) { convType = "DepthwiseConv"; convGradInputType = "DepthwiseConvGradInput"; convGradFilterType = "DepthwiseConvGradFilter"; - } else { - convType = "GemmConv"; - convGradInputType = "GemmConvGradInput"; - convGradFilterType = "GemmConvGradFilter"; + } + + // If depth wise convolution and useGpu == false and ARM-NEON + if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) { +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + if ((filterSize_[i] == filterSizeY_[i]) && + (filterSize_[i] == 3 || filterSize_[i] == 4) && + (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) { + convType = "NeonDepthwiseConv"; + } +#endif } if (FLAGS_use_nnpack && !isDeconv_) {