From 5337daea3279b51a25f1f23ca1de81135e15c995 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Fri, 28 Dec 2018 00:13:05 +0800 Subject: [PATCH] Optimize 5x5 depthwise conv for speedup 6x --- src/framework/operator.cpp | 15 +- src/io/api_paddle_mobile.cc | 1 + src/io/api_paddle_mobile.h | 10 +- src/io/paddle_inference_api.h | 5 +- src/io/paddle_mobile.cpp | 26 +- src/io/paddle_mobile.h | 11 +- src/operators/kernel/arm/conv_kernel.cpp | 11 + .../kernel/central-arm-func/conv_arm_func.h | 1 + src/operators/math/depthwise_conv5x5.cpp | 740 ++++++++++++++++++ src/operators/math/depthwise_conv5x5.h | 48 ++ src/operators/op_param.h | 6 +- 11 files changed, 844 insertions(+), 30 deletions(-) create mode 100644 src/operators/math/depthwise_conv5x5.cpp create mode 100644 src/operators/math/depthwise_conv5x5.h diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index e0b40cebf7..2b9e1d0451 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -64,9 +64,10 @@ void OperatorBase::Run() { for (const auto key : input_keys) { auto var_vec_in = inputs_.at(key); for (int i = 0; i < var_vec_in.size(); ++i) { - auto vari = scope_->FindVar(var_vec_in[i]); + DLOG << var_vec_in[i]; + auto vari = this->scope_->FindVar("input"); if (vari->IsInitialized()) { - Tensor *tensor = vari->template GetMutable(); + const Tensor *tensor = vari->template Get(); if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; } } @@ -76,7 +77,7 @@ void OperatorBase::Run() { for (int i = 0; i < var_vec_out.size(); ++i) { auto vari = scope_->FindVar(var_vec_out[i]); if (vari->IsInitialized()) { - Tensor *tensor = vari->template GetMutable(); + const Tensor *tensor = vari->template Get(); if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; } } @@ -97,10 +98,10 @@ void OperatorBase::Run() { auto vari = scope_->FindVar(var_vec_in[i]); if (vari->IsInitialized()) { if (type_ == "feed") { - Tensor *tensor = vari->template GetMutable(); + const Tensor *tensor = vari->template Get(); if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; } else { - CLImage *cl_image = vari->template GetMutable(); + const CLImage *cl_image = vari->template Get(); if (cl_image) { DLOG << type_ << " input- " << key << "=" << *cl_image; } @@ -114,12 +115,12 @@ void OperatorBase::Run() { auto vari = scope_->FindVar(var_vec_out[i]); if (vari->IsInitialized()) { if (type_ == "fetch") { - Tensor *tensor = vari->template GetMutable(); + const Tensor *tensor = vari->template Get(); if (tensor) { DLOG << type_ << " output- " << key << "=" << *tensor; } } else { - CLImage *cl_image = vari->template GetMutable(); + const CLImage *cl_image = vari->template Get(); if (cl_image) { DLOG << type_ << " output- " << key << "=" << *cl_image; } diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc index 74ae2ef9ee..dd3b1b7317 100644 --- a/src/io/api_paddle_mobile.cc +++ b/src/io/api_paddle_mobile.cc @@ -14,6 +14,7 @@ #include "io/api_paddle_mobile.h" #include +#include "common/enforce.h" #include "framework/tensor.h" namespace paddle_mobile { diff --git a/src/io/api_paddle_mobile.h b/src/io/api_paddle_mobile.h index a552d9152d..bca169a2ed 100644 --- a/src/io/api_paddle_mobile.h +++ b/src/io/api_paddle_mobile.h @@ -12,19 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -/* - * This file contains the implementation of inference API with Anakin engine - * embeded, this API can only support Anakin models. - */ - #pragma once #include -#include "io/paddle_inference_api.h" - -// from paddle_mobile -#include "common/enforce.h" #include "common/types.h" +#include "io/paddle_inference_api.h" #include "io/paddle_mobile.h" namespace paddle_mobile { diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h index 5326f864a4..afbd93dede 100644 --- a/src/io/paddle_inference_api.h +++ b/src/io/paddle_inference_api.h @@ -104,6 +104,8 @@ class PaddlePredictor { // The common configs for all the predictors. struct Config { std::string model_dir; // path to the model directory. + std::string prog_file; + std::string param_file; }; protected: @@ -128,9 +130,8 @@ struct PaddleMobileConfig : public PaddlePredictor::Config { int batch_size = 1; bool optimize = true; bool quantification = false; + bool lod_mode = false; int thread_num = 1; - std::string prog_file; - std::string param_file; std::string cl_path; struct PaddleModelMemoryPack memory_pack; }; diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index bd27c219ea..9c3557da27 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -15,6 +15,9 @@ limitations under the License. */ #include "io/paddle_mobile.h" #include #include "common/common.h" +#ifdef _OPENMP +#include +#endif // _OPENMP #ifdef PADDLE_MOBILE_CL #include #include "framework/cl/cl_tensor.h" @@ -33,7 +36,7 @@ void PaddleMobile::SetThreadNum(int num) { template PMStatus PaddleMobile::Load(const std::string &dirname, bool optimize, bool quantification, - int batch_size, bool loddable) { + int batch_size, bool lod_mode) { if (loader_.get() == nullptr) { loader_ = std::make_shared>(); } else { @@ -43,7 +46,7 @@ PMStatus PaddleMobile::Load(const std::string &dirname, if (executor_.get() == nullptr) { executor_ = std::make_shared>( loader_->Load(dirname, optimize, quantification), batch_size, optimize, - loddable); + lod_mode); } else { LOG(kLOG_INFO) << "executor inited"; } @@ -55,7 +58,7 @@ template PMStatus PaddleMobile::Load(const std::string &model_path, const std::string ¶_path, bool optimize, bool quantification, - int batch_size, bool loddable) { + int batch_size, bool lod_mode) { if (loader_.get() == nullptr) { loader_ = std::make_shared>(); } else { @@ -65,7 +68,7 @@ PMStatus PaddleMobile::Load(const std::string &model_path, if (executor_.get() == nullptr) { executor_ = std::make_shared>( loader_->Load(model_path, para_path, optimize, quantification), - batch_size, optimize, loddable); + batch_size, optimize, lod_mode); } else { LOG(kLOG_INFO) << "executor inited"; } @@ -73,6 +76,21 @@ PMStatus PaddleMobile::Load(const std::string &model_path, return PMSuccess; } +template +PMStatus PaddleMobile::Load(const PaddleMobileConfig &config) { + if (!config.model_dir.empty()) { + return this->Load(config.model_dir, config.optimize, config.quantification, + config.batch_size, config.lod_mode); + } else if (!config.prog_file.empty() && !config.param_file.empty()) { + return this->Load(config.prog_file, config.param_file, config.optimize, + config.quantification, config.batch_size, + config.lod_mode); + } else { + LOG(kLOG_ERROR) << "Failed to load inference model"; + return PMNotInitialized; + } +} + template bool PaddleMobile::LoadCombinedMemory(size_t model_len, const uint8_t *model_buf, diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h index 2bbdccc95e..1651da7c22 100644 --- a/src/io/paddle_mobile.h 
+++ b/src/io/paddle_mobile.h @@ -18,15 +18,12 @@ limitations under the License. */ #include #include #include -#ifdef _OPENMP -#include -#endif // _OPENMP - #include "common/types.h" #include "framework/executor.h" #include "framework/load_ops.h" #include "framework/loader.h" #include "framework/tensor.h" +#include "io/paddle_inference_api.h" #ifdef PADDLE_MOBILE_CL #include "framework/cl/cl_engine.h" #endif @@ -46,10 +43,12 @@ class PaddleMobile { PMStatus Load(const std::string &dirname, const bool optimize = false, const bool quantification = false, const int batch_size = 1, - const bool lod = false); + const bool lod_mode = false); PMStatus Load(const std::string &model_path, const std::string ¶_path, const bool optimize = false, const bool quantification = false, - const int batch_size = 1, const bool lod = false); + const int batch_size = 1, const bool lod_mode = false); + + PMStatus Load(const PaddleMobileConfig &config); PMStatus Predict(const framework::Tensor &input); PMStatus Predict(const framework::LoDTensor &input); diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp index dc18c983bb..180a209690 100644 --- a/src/operators/kernel/arm/conv_kernel.cpp +++ b/src/operators/kernel/arm/conv_kernel.cpp @@ -24,8 +24,12 @@ template <> bool ConvKernel::Init(ConvParam *param) { bool conv3x3 = param->Filter()->dims()[2] == param->Filter()->dims()[3] && param->Filter()->dims()[2] == 3; + bool conv5x5 = param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Filter()->dims()[2] == 5; bool depth3x3 = conv3x3 && param->Groups() == param->Input()->dims()[1] && param->Input()->dims()[1] == param->Output()->dims()[1]; + bool depth5x5 = conv5x5 && param->Groups() == param->Input()->dims()[1] && + param->Input()->dims()[1] == param->Output()->dims()[1]; if (param->Filter()->type() == typeid(int8_t)) { if (depth3x3 && param->Strides()[0] < 3 && param->Strides()[0] == param->Strides()[1]) { @@ -46,6 +50,9 @@ bool ConvKernel::Init(ConvParam *param) { param->Strides()[0] == 2 && param->Paddings()[0] == 1 && param->Paddings()[0] == param->Paddings()[1]) { param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S2P1_FLOAT; + } else if (depth5x5 && param->Strides()[0] == param->Strides()[1] && + param->Strides()[0] == 1) { + param->ExecMode() = ConvParam::EXEC_DEPTHWISE5x5S1_FLOAT; #ifndef __aarch64__ } else if (conv3x3 && param->Strides()[0] == param->Strides()[1] && param->Dilations()[0] == param->Dilations()[1] && @@ -87,6 +94,10 @@ void ConvKernel::Compute(const ConvParam ¶m) { math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(), nullptr, false); break; + case ConvParam::EXEC_DEPTHWISE5x5S1_FLOAT: + math::DepthwiseConv5x5S1(*param.Input(), *param.Filter(), + param.Paddings(), param.Output()); + break; case ConvParam::EXEC_WINOGRAD3X3_FLOAT: WinogradConv3x3<8, 3>(param); break; diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h index 848f1b113b..0a21545e1c 100644 --- a/src/operators/kernel/central-arm-func/conv_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_arm_func.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include "operators/math/conv_func.h" #include "operators/math/depthwise_conv3x3.h" +#include "operators/math/depthwise_conv5x5.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" #include "operators/math/pad.h" diff --git a/src/operators/math/depthwise_conv5x5.cpp b/src/operators/math/depthwise_conv5x5.cpp new file mode 100644 index 0000000000..f7dc24024f --- /dev/null +++ b/src/operators/math/depthwise_conv5x5.cpp @@ -0,0 +1,740 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if defined(__ARM_NEON__) && !defined(__aarch64__) + +#include "operators/math/depthwise_conv5x5.h" +#include +#include + +namespace paddle_mobile { +namespace operators { +namespace math { + +#ifndef __aarch64__ +inline float32x4_t vpaddq_f32(float32x4_t r0, float32x4_t r1) { + float32x2_t sum0 = vpadd_f32(vget_low_f32(r0), vget_high_f32(r0)); + float32x2_t sum1 = vpadd_f32(vget_low_f32(r1), vget_high_f32(r1)); + return vcombine_f32(sum0, sum1); +} +#endif + +template +inline void Depth5x5NormalRowLoadInput(const float *input, float32x4_t *y) { + y[0] = vld1q_f32(input); + y[4] = vld1q_f32(input + 4); + y[1] = vextq_f32(y[0], y[4], 1); + y[2] = vextq_f32(y[0], y[4], 2); + y[3] = vextq_f32(y[0], y[4], 3); +} + +template <> +inline void Depth5x5NormalRowLoadInput<2>(const float *input, float32x4_t *y) { + float32x4x2_t x = vld2q_f32(input); + y[0] = x.val[0]; + y[1] = x.val[1]; + y[2] = vextq_f32(y[0], y[0], 1); + y[3] = vextq_f32(y[1], y[1], 1); + y[4] = vextq_f32(y[0], y[0], 2); +} + +#define DEPTHWISE_CONV_NORMAL_BORDER(start, end) \ + for (int w = start; w < end; ++w) { \ + const int w_in_start = -padding_w + w * Stride_w; \ + const int w_in_end = w_in_start + 5; \ + const int w_start = w_in_start > 0 ? w_in_start : 0; \ + const int w_end = w_in_end < input_w ? w_in_end : input_w; \ + float value = 0; \ + for (int h_in = h_start; h_in < h_end; ++h_in) { \ + for (int w_in = w_start; w_in < w_end; ++w_in) { \ + value += filter[(h_in - h_in_start) * 5 + (w_in - w_in_start)] * \ + input[h_in * input_w + w_in]; \ + } \ + } \ + output_ptr[w] = value; \ + } + +template +inline void DepthwiseConv5x5NormalRow(const float *input, const float *filter, + const int h_output, const int input_h, + const int input_w, const int padding_h, + const int padding_w, const int output_w, + float *output, float32x4_t *ker, + float32_t *ker1) { + const int h_in_start = -padding_h + h_output * Stride_h; + const int h_in_end = h_in_start + 5; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int h_end = h_in_end < input_h ? 
h_in_end : input_h; + + int valid_w_start = (padding_w + Stride_w - 1) / Stride_w; + int valid_w_end = output_w - valid_w_start; + + float *output_ptr = output + h_output * output_w; + // border left + DEPTHWISE_CONV_NORMAL_BORDER(0, valid_w_start) + // middle + int output_tiles = (valid_w_end - valid_w_start) >> 2; + float32x4_t _sum, _x[5]; + // valid w + for (int w = 0; w < output_tiles * 4; w += 4) { + _sum = vdupq_n_f32(0.f); + int output_offset = valid_w_start + w; + int input_w_offset = output_offset * Stride_w - padding_w; + for (int h_in = h_start; h_in < h_end; ++h_in) { + int index = h_in - h_in_start; + Depth5x5NormalRowLoadInput( + input + h_in * input_w + input_w_offset, _x); + _sum = vmlaq_n_f32(_sum, _x[0], ker1[index]); + _sum = vmlaq_lane_f32(_sum, _x[1], vget_low_f32(ker[index]), 0); + _sum = vmlaq_lane_f32(_sum, _x[2], vget_low_f32(ker[index]), 1); + _sum = vmlaq_lane_f32(_sum, _x[3], vget_high_f32(ker[index]), 0); + _sum = vmlaq_lane_f32(_sum, _x[4], vget_high_f32(ker[index]), 1); + } + vst1q_f32(output_ptr + output_offset, _sum); + } + // remain valid w + int remain = (valid_w_end - valid_w_start) & 0x3; + if (remain > 0) { + _sum = vdupq_n_f32(0.f); + int remain_start = valid_w_start + (output_tiles << 2); + int input_w_offset = remain_start * Stride_w - padding_w; + for (int h_in = h_start; h_in < h_end; ++h_in) { + int index = h_in - h_in_start; + Depth5x5NormalRowLoadInput( + input + h_in * input_w + input_w_offset, _x); + _sum = vmlaq_n_f32(_sum, _x[0], ker1[index]); + _sum = vmlaq_lane_f32(_sum, _x[1], vget_low_f32(ker[index]), 0); + _sum = vmlaq_lane_f32(_sum, _x[2], vget_low_f32(ker[index]), 1); + _sum = vmlaq_lane_f32(_sum, _x[3], vget_high_f32(ker[index]), 0); + _sum = vmlaq_lane_f32(_sum, _x[4], vget_high_f32(ker[index]), 1); + } + switch (remain) { + case 1: + vst1_lane_f32(output_ptr + remain_start, vget_low_f32(_sum), 0); + break; + case 2: + vst1_f32(output_ptr + remain_start, vget_low_f32(_sum)); + break; + case 3: + vst1_f32(output_ptr + remain_start, vget_low_f32(_sum)); + vst1_lane_f32(output_ptr + remain_start + 2, vget_high_f32(_sum), 0); + break; + } + } + // border right + DEPTHWISE_CONV_NORMAL_BORDER(valid_w_end, output_w) +} + +template <> +void DepthwiseConv5x5S1(const framework::Tensor &input, + const framework::Tensor &filter, + const std::vector &paddings, + framework::Tensor *output) { + const float *input_data = input.data(); + const float *filter_data = filter.data(); + float *out_data = output->mutable_data(); + int input_h = input.dims()[2]; + int input_w = input.dims()[3]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + int padding_h = paddings[0]; + int padding_w = paddings[1]; + int image_size = input_h * input_w; + int out_image_size = output_h * output_w; + int valid_h_start = padding_h; + int valid_h_end = output_h - valid_h_start; + int valid_h = valid_h_end - valid_h_start; + int valid_w_start = padding_w; + int valid_w_end = output_w - valid_w_start; + int valid_w = valid_w_end - valid_w_start; + DLOG << "valid_h_start: " << valid_h_start; + DLOG << "valid_h_end: " << valid_h_end; + DLOG << "valid_w_start: " << valid_w_start; + DLOG << "valid_w_end: " << valid_w_end; + + for (int g = 0; g < input.dims()[1]; ++g) { + const float *input_ptr = input_data + g * image_size; + const float *filter_ptr = filter_data + g * 25; + float *output_ptr = out_data + g * out_image_size; + + const float *filter_ptr0 = filter_ptr; + const float *filter_ptr1 = filter_ptr0 + 5; + const float *filter_ptr2 = filter_ptr1 + 
5; + const float *filter_ptr3 = filter_ptr2 + 5; + const float *filter_ptr4 = filter_ptr3 + 5; + float32x4_t _ker[7]; + float32_t _ker1[5] = {*filter_ptr0, *filter_ptr1, *filter_ptr2, + *filter_ptr3, *filter_ptr4}; + _ker[0] = vld1q_f32(filter_ptr0 + 1); + _ker[1] = vld1q_f32(filter_ptr1 + 1); + _ker[2] = vld1q_f32(filter_ptr2 + 1); + _ker[3] = vld1q_f32(filter_ptr3 + 1); + _ker[4] = vld1q_f32(filter_ptr4 + 1); + _ker[5] = vld1q_f32(_ker1); + _ker[6] = vld1q_f32(_ker1 + 4); + + // pad top + for (int h = 0; h < valid_h_start; ++h) { + DepthwiseConv5x5NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, + input_w, padding_h, padding_w, output_w, + output_ptr, _ker, _ker1); + } + + // output 4x4 + int output_w_tiles = valid_w / 4; + int output_w_remain = valid_w - output_w_tiles * 4; + for (int h = valid_h_start; h < valid_h_end - 1; h += 2) { + const float *input_ptr0 = input_ptr + (h - padding_h) * input_w; + const float *input_ptr1 = input_ptr0 + input_w; + const float *input_ptr2 = input_ptr1 + input_w; + const float *input_ptr3 = input_ptr2 + input_w; + const float *input_ptr4 = input_ptr3 + input_w; + const float *input_ptr5 = input_ptr4 + input_w; + float *output_ptr0 = output_ptr + h * output_w; + float *output_ptr1 = output_ptr0 + output_w; + // pad left + if (padding_w) { + float32x4_t row0 = vld1q_f32(input_ptr0); + float32x4_t row1 = vld1q_f32(input_ptr1); + float32x4_t row2 = vld1q_f32(input_ptr2); + float32x4_t row3 = vld1q_f32(input_ptr3); + float32x4_t row4 = vld1q_f32(input_ptr4); + float32x4_t row5 = vld1q_f32(input_ptr5); + float32x4_t zero = vdupq_n_f32(0.f); + for (int w = valid_w_start - 1; w >= 0; --w) { + int padding = padding_w - w; + if (padding >= 5) { + output_ptr0[w] = 0.f; + output_ptr1[w] = 0.f; + } else { + row0 = vmulq_f32(row0, _ker[0]); + row0 = vmlaq_f32(row0, row1, _ker[1]); + row0 = vmlaq_f32(row0, row2, _ker[2]); + row0 = vmlaq_f32(row0, row3, _ker[3]); + row0 = vmlaq_f32(row0, row4, _ker[4]); + row1 = vmulq_f32(row1, _ker[0]); + row1 = vmlaq_f32(row1, row2, _ker[1]); + row1 = vmlaq_f32(row1, row3, _ker[2]); + row1 = vmlaq_f32(row1, row4, _ker[3]); + row1 = vmlaq_f32(row1, row5, _ker[4]); + row0 = vpaddq_f32(row0, row1); + float32x2_t sum = + vpadd_f32(vget_low_f32(row0), vget_high_f32(row0)); + vst1_lane_f32(output_ptr0 + w, sum, 0); + vst1_lane_f32(output_ptr1 + w, sum, 1); + + row0 = vextq_f32(zero, row0, 3); + row1 = vextq_f32(zero, row1, 3); + row2 = vextq_f32(zero, row2, 3); + row3 = vextq_f32(zero, row3, 3); + row4 = vextq_f32(zero, row4, 3); + row5 = vextq_f32(zero, row5, 3); + } + } + output_ptr0 += valid_w_start; + output_ptr1 += valid_w_start; + } + // valid + int loop = output_w_tiles; + asm volatile( + "cmp %[loop], #0 \n" + "ble start_remain_%= \n" + "mov r0, #16 \n" + "loop_2h4w_%=: \n" + "vld1.32 {d14-d17}, [%[input_ptr0]], r0 \n" + "vld1.32 {d18-d21}, [%[input_ptr1]], r0 \n" + "vld1.32 {d22-d25}, [%[input_ptr2]], r0 \n" + "vmul.f32 q14, q7, %e[ker0][0] \n" + "vext.32 q13, q7, q8, #1 \n" + "vmla.f32 q14, q13, %e[kr0][0] \n" + "vext.32 q13, q7, q8, #2 \n" + "vmla.f32 q14, q13, %e[kr0][1] \n" + "vext.32 q13, q7, q8, #3 \n" + "vmla.f32 q14, q13, %f[kr0][0] \n" + "vmla.f32 q14, q8, %f[kr0][1] \n" + + "vmla.f32 q14, q9, %e[ker0][1] \n" + "vmul.f32 q15, q9, %e[ker0][0] \n" + "vext.32 q13, q9, q10, #1 \n" + "vmla.f32 q14, q13, %e[kr1][0] \n" + "vmla.f32 q15, q13, %e[kr0][0] \n" + "vext.32 q13, q9, q10, #2 \n" + "vmla.f32 q14, q13, %e[kr1][1] \n" + "vmla.f32 q15, q13, %e[kr0][1] \n" + "vext.32 q13, q9, q10, #3 \n" + "vmla.f32 q14, q13, %f[kr1][0] 
\n" + "vmla.f32 q15, q13, %f[kr0][0] \n" + "vmla.f32 q14, q10, %f[kr1][1] \n" + "vmla.f32 q15, q10, %f[kr0][1] \n" + + "vmla.f32 q14, q11, %f[ker0][0] \n" + "vmla.f32 q15, q11, %e[ker0][1] \n" + "vext.32 q13, q11, q12, #1 \n" + "vmla.f32 q14, q13, %e[kr2][0] \n" + "vmla.f32 q15, q13, %e[kr1][0] \n" + "vext.32 q13, q11, q12, #2 \n" + "vmla.f32 q14, q13, %e[kr2][1] \n" + "vmla.f32 q15, q13, %e[kr1][1] \n" + "vext.32 q13, q11, q12, #3 \n" + "vmla.f32 q14, q13, %f[kr2][0] \n" + "vmla.f32 q15, q13, %f[kr1][0] \n" + "vmla.f32 q14, q12, %f[kr2][1] \n" + "vmla.f32 q15, q12, %f[kr1][1] \n" + + "vld1.32 {d14-d17}, [%[input_ptr3]], r0 \n" + "vld1.32 {d18-d21}, [%[input_ptr4]], r0 \n" + "vld1.32 {d22-d25}, [%[input_ptr5]], r0 \n" + "vmla.f32 q14, q7, %f[ker0][1] \n" + "vmla.f32 q15, q7, %f[ker0][0] \n" + "vext.32 q13, q7, q8, #1 \n" + "vmla.f32 q14, q13, %e[kr3][0] \n" + "vmla.f32 q15, q13, %e[kr2][0] \n" + "vext.32 q13, q7, q8, #2 \n" + "vmla.f32 q14, q13, %e[kr3][1] \n" + "vmla.f32 q15, q13, %e[kr2][1] \n" + "vext.32 q13, q7, q8, #3 \n" + "vmla.f32 q14, q13, %f[kr3][0] \n" + "vmla.f32 q15, q13, %f[kr2][0] \n" + "vmla.f32 q14, q8, %f[kr3][1] \n" + "vmla.f32 q15, q8, %f[kr2][1] \n" + + "vmla.f32 q14, q9, %e[ker1][0] \n" + "vmla.f32 q15, q9, %f[ker0][1] \n" + "vext.32 q13, q9, q10, #1 \n" + "vmla.f32 q14, q13, %e[kr4][0] \n" + "vmla.f32 q15, q13, %e[kr3][0] \n" + "vext.32 q13, q9, q10, #2 \n" + "vmla.f32 q14, q13, %e[kr4][1] \n" + "vmla.f32 q15, q13, %e[kr3][1] \n" + "vext.32 q13, q9, q10, #3 \n" + "vmla.f32 q14, q13, %f[kr4][0] \n" + "vmla.f32 q15, q13, %f[kr3][0] \n" + "vmla.f32 q14, q10, %f[kr4][1] \n" + "vmla.f32 q15, q10, %f[kr3][1] \n" + + "vmla.f32 q15, q11, %e[ker1][0] \n" + "vext.32 q13, q11, q12, #1 \n" + "vmla.f32 q15, q13, %e[kr4][0] \n" + "vext.32 q13, q11, q12, #2 \n" + "vmla.f32 q15, q13, %e[kr4][1] \n" + "vext.32 q13, q11, q12, #3 \n" + "vmla.f32 q15, q13, %f[kr4][0] \n" + "vmla.f32 q15, q12, %f[kr4][1] \n" + // restore output + "vst1.32 {q14}, [%[output_ptr0]]! \n" + "vst1.32 {q15}, [%[output_ptr1]]! 
\n" + "subs %[loop], #1 \n" + "bne loop_2h4w_%= \n" + + "start_remain_%=: \n" + "cmp %[remain], #0 \n" + "ble end_%= \n" + "mov r0, %[remain], lsl #2 \n" + "vld1.32 {d14-d17}, [%[input_ptr0]], r0 \n" + "vld1.32 {d18-d21}, [%[input_ptr1]], r0 \n" + "vld1.32 {d22-d25}, [%[input_ptr2]], r0 \n" + "vmul.f32 q14, q7, %e[ker0][0] \n" + "vext.32 q13, q7, q8, #1 \n" + "vmla.f32 q14, q13, %e[kr0][0] \n" + "vext.32 q13, q7, q8, #2 \n" + "vmla.f32 q14, q13, %e[kr0][1] \n" + "vext.32 q13, q7, q8, #3 \n" + "vmla.f32 q14, q13, %f[kr0][0] \n" + "vmla.f32 q14, q8, %f[kr0][1] \n" + + "vmla.f32 q14, q9, %e[ker0][1] \n" + "vmul.f32 q15, q9, %e[ker0][0] \n" + "vext.32 q13, q9, q10, #1 \n" + "vmla.f32 q14, q13, %e[kr1][0] \n" + "vmla.f32 q15, q13, %e[kr0][0] \n" + "vext.32 q13, q9, q10, #2 \n" + "vmla.f32 q14, q13, %e[kr1][1] \n" + "vmla.f32 q15, q13, %e[kr0][1] \n" + "vext.32 q13, q9, q10, #3 \n" + "vmla.f32 q14, q13, %f[kr1][0] \n" + "vmla.f32 q15, q13, %f[kr0][0] \n" + "vmla.f32 q14, q10, %f[kr1][1] \n" + "vmla.f32 q15, q10, %f[kr0][1] \n" + + "vmla.f32 q14, q11, %f[ker0][0] \n" + "vmla.f32 q15, q11, %e[ker0][1] \n" + "vext.32 q13, q11, q12, #1 \n" + "vmla.f32 q14, q13, %e[kr2][0] \n" + "vmla.f32 q15, q13, %e[kr1][0] \n" + "vext.32 q13, q11, q12, #2 \n" + "vmla.f32 q14, q13, %e[kr2][1] \n" + "vmla.f32 q15, q13, %e[kr1][1] \n" + "vext.32 q13, q11, q12, #3 \n" + "vmla.f32 q14, q13, %f[kr2][0] \n" + "vmla.f32 q15, q13, %f[kr1][0] \n" + "vmla.f32 q14, q12, %f[kr2][1] \n" + "vmla.f32 q15, q12, %f[kr1][1] \n" + + "vld1.32 {d14-d17}, [%[input_ptr3]], r0 \n" + "vld1.32 {d18-d21}, [%[input_ptr4]], r0 \n" + "vld1.32 {d22-d25}, [%[input_ptr5]], r0 \n" + "vmla.f32 q14, q7, %f[ker0][1] \n" + "vmla.f32 q15, q7, %f[ker0][0] \n" + "vext.32 q13, q7, q8, #1 \n" + "vmla.f32 q14, q13, %e[kr3][0] \n" + "vmla.f32 q15, q13, %e[kr2][0] \n" + "vext.32 q13, q7, q8, #2 \n" + "vmla.f32 q14, q13, %e[kr3][1] \n" + "vmla.f32 q15, q13, %e[kr2][1] \n" + "vext.32 q13, q7, q8, #3 \n" + "vmla.f32 q14, q13, %f[kr3][0] \n" + "vmla.f32 q15, q13, %f[kr2][0] \n" + "vmla.f32 q14, q8, %f[kr3][1] \n" + "vmla.f32 q15, q8, %f[kr2][1] \n" + + "vmla.f32 q14, q9, %e[ker1][0] \n" + "vmla.f32 q15, q9, %f[ker0][1] \n" + "vext.32 q13, q9, q10, #1 \n" + "vmla.f32 q14, q13, %e[kr4][0] \n" + "vmla.f32 q15, q13, %e[kr3][0] \n" + "vext.32 q13, q9, q10, #2 \n" + "vmla.f32 q14, q13, %e[kr4][1] \n" + "vmla.f32 q15, q13, %e[kr3][1] \n" + "vext.32 q13, q9, q10, #3 \n" + "vmla.f32 q14, q13, %f[kr4][0] \n" + "vmla.f32 q15, q13, %f[kr3][0] \n" + "vmla.f32 q14, q10, %f[kr4][1] \n" + "vmla.f32 q15, q10, %f[kr3][1] \n" + + "vmla.f32 q15, q11, %e[ker1][0] \n" + "vext.32 q13, q11, q12, #1 \n" + "vmla.f32 q15, q13, %e[kr4][0] \n" + "vext.32 q13, q11, q12, #2 \n" + "vmla.f32 q15, q13, %e[kr4][1] \n" + "vext.32 q13, q11, q12, #3 \n" + "vmla.f32 q15, q13, %f[kr4][0] \n" + "vmla.f32 q15, q12, %f[kr4][1] \n" + + "cmp %[remain], #2 \n" + "blt store_2h1w_%= \n" + "vst1.32 {d28}, [%[output_ptr0]]! \n" + "vst1.32 {d30}, [%[output_ptr1]]! \n" + "cmp %[remain], #3 \n" + "blt end_%= \n" + "vst1.32 {d29[0]}, [%[output_ptr0]]! \n" + "vst1.32 {d31[0]}, [%[output_ptr1]]! \n" + "b end_%= \n" + + "store_2h1w_%=: \n" + "vst1.32 {d28[0]}, [%[output_ptr0]]! \n" + "vst1.32 {d30[0]}, [%[output_ptr1]]! 
\n" + "end_%=: \n" + : [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), + [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3), + [input_ptr4] "+r"(input_ptr4), [input_ptr5] "+r"(input_ptr5), + [output_ptr0] "+r"(output_ptr0), [output_ptr1] "+r"(output_ptr1), + [loop] "+r"(loop) + : [remain] "r"(output_w_remain), [kr0] "w"(_ker[0]), + [kr1] "w"(_ker[1]), [kr2] "w"(_ker[2]), [kr3] "w"(_ker[3]), + [kr4] "w"(_ker[4]), [ker0] "w"(_ker[5]), [ker1] "w"(_ker[6]) + : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "r0"); + // pad right + if (padding_w) { + float32x4_t row0 = vld1q_f32(input_ptr0); + float32x4_t row1 = vld1q_f32(input_ptr1); + float32x4_t row2 = vld1q_f32(input_ptr2); + float32x4_t row3 = vld1q_f32(input_ptr3); + float32x4_t row4 = vld1q_f32(input_ptr4); + float32x4_t row5 = vld1q_f32(input_ptr5); + float32x4_t zero = vdupq_n_f32(0.f); + for (int w = valid_w_end; w < output_w; ++w) { + int padding = w + 5 - (padding_w + input_w); + if (padding >= 5) { + *output_ptr0 = 0.f; + *output_ptr1 = 0.f; + } else { + int iw = w - valid_w_end; + float sum0 = input_ptr0[iw] * filter_ptr0[0] + + input_ptr1[iw] * filter_ptr1[0] + + input_ptr2[iw] * filter_ptr2[0] + + input_ptr3[iw] * filter_ptr3[0] + + input_ptr4[iw] * filter_ptr4[0]; + float sum1 = input_ptr1[iw] * filter_ptr0[0] + + input_ptr2[iw] * filter_ptr1[0] + + input_ptr3[iw] * filter_ptr2[0] + + input_ptr4[iw] * filter_ptr3[0] + + input_ptr5[iw] * filter_ptr4[0]; + row0 = vextq_f32(row0, zero, 1); + row1 = vextq_f32(row1, zero, 1); + row2 = vextq_f32(row2, zero, 1); + row3 = vextq_f32(row3, zero, 1); + row4 = vextq_f32(row4, zero, 1); + row5 = vextq_f32(row5, zero, 1); + row0 = vmulq_f32(row0, _ker[0]); + row0 = vmlaq_f32(row0, row1, _ker[1]); + row0 = vmlaq_f32(row0, row2, _ker[2]); + row0 = vmlaq_f32(row0, row3, _ker[3]); + row0 = vmlaq_f32(row0, row4, _ker[4]); + row1 = vmulq_f32(row1, _ker[0]); + row1 = vmlaq_f32(row1, row2, _ker[1]); + row1 = vmlaq_f32(row1, row3, _ker[2]); + row1 = vmlaq_f32(row1, row4, _ker[3]); + row1 = vmlaq_f32(row1, row5, _ker[4]); + row0 = vpaddq_f32(row0, row1); + float32x2_t sum = + vpadd_f32(vget_low_f32(row0), vget_high_f32(row0)); + sum0 += vget_lane_f32(sum, 0); + sum1 += vget_lane_f32(sum, 1); + *output_ptr0 = sum0; + *output_ptr1 = sum1; + } + output_ptr0++; + output_ptr1++; + } + } + } + // remain height + int start_h = valid_h_start + (valid_h & 0xfffe); + if (start_h < valid_h_end) { + const float *input_ptr0 = input_ptr + (start_h - padding_h) * input_w; + const float *input_ptr1 = input_ptr0 + input_w; + const float *input_ptr2 = input_ptr1 + input_w; + const float *input_ptr3 = input_ptr2 + input_w; + const float *input_ptr4 = input_ptr3 + input_w; + float *output_ptr0 = output_ptr + start_h * output_w; + // pad left + if (padding_w) { + float32x4_t row0 = vld1q_f32(input_ptr0); + float32x4_t row1 = vld1q_f32(input_ptr1); + float32x4_t row2 = vld1q_f32(input_ptr2); + float32x4_t row3 = vld1q_f32(input_ptr3); + float32x4_t row4 = vld1q_f32(input_ptr4); + float32x4_t zero = vdupq_n_f32(0.f); + for (int w = valid_w_start - 1; w >= 0; --w) { + int padding = padding_w - w; + if (padding >= 5) { + output_ptr0[w] = 0.f; + } else { + row0 = vmulq_f32(row0, _ker[0]); + row0 = vmlaq_f32(row0, row1, _ker[1]); + row0 = vmlaq_f32(row0, row2, _ker[2]); + row0 = vmlaq_f32(row0, row3, _ker[3]); + row0 = vmlaq_f32(row0, row4, _ker[4]); + float32x2_t sum = + vpadd_f32(vget_low_f32(row0), vget_high_f32(row0)); + sum = vpadd_f32(sum, sum); + 
vst1_lane_f32(output_ptr0 + w, sum, 0); + + row0 = vextq_f32(zero, row0, 3); + row1 = vextq_f32(zero, row1, 3); + row2 = vextq_f32(zero, row2, 3); + row3 = vextq_f32(zero, row3, 3); + row4 = vextq_f32(zero, row4, 3); + } + } + output_ptr0 += valid_w_start; + } + // valid + int loop = output_w_tiles; + asm volatile( + "cmp %[loop], #0 \n" + "ble start_remain_%= \n" + "mov r0, #16 \n" + "loop_1h4w_%=: \n" + "vld1.32 {d14-d17}, [%[input_ptr0]], r0 \n" + "vld1.32 {d18-d21}, [%[input_ptr1]], r0 \n" + "vld1.32 {d22-d25}, [%[input_ptr2]], r0 \n" + "vmul.f32 q14, q7, %e[ker0][0] \n" + "vext.32 q13, q7, q8, #1 \n" + "vmla.f32 q14, q13, %e[kr0][0] \n" + "vext.32 q13, q7, q8, #2 \n" + "vmla.f32 q14, q13, %e[kr0][1] \n" + "vext.32 q13, q7, q8, #3 \n" + "vmla.f32 q14, q13, %f[kr0][0] \n" + "vmla.f32 q14, q8, %f[kr0][1] \n" + + "vmla.f32 q14, q9, %e[ker0][1] \n" + "vext.32 q13, q9, q10, #1 \n" + "vmla.f32 q14, q13, %e[kr1][0] \n" + "vext.32 q13, q9, q10, #2 \n" + "vmla.f32 q14, q13, %e[kr1][1] \n" + "vext.32 q13, q9, q10, #3 \n" + "vmla.f32 q14, q13, %f[kr1][0] \n" + "vmla.f32 q14, q10, %f[kr1][1] \n" + + "vmla.f32 q14, q11, %f[ker0][0] \n" + "vext.32 q13, q11, q12, #1 \n" + "vmla.f32 q14, q13, %e[kr2][0] \n" + "vext.32 q13, q11, q12, #2 \n" + "vmla.f32 q14, q13, %e[kr2][1] \n" + "vext.32 q13, q11, q12, #3 \n" + "vmla.f32 q14, q13, %f[kr2][0] \n" + "vmla.f32 q14, q12, %f[kr2][1] \n" + + "vld1.32 {d14-d17}, [%[input_ptr3]], r0 \n" + "vld1.32 {d18-d21}, [%[input_ptr4]], r0 \n" + "vmla.f32 q14, q7, %f[ker0][1] \n" + "vext.32 q13, q7, q8, #1 \n" + "vmla.f32 q14, q13, %e[kr3][0] \n" + "vext.32 q13, q7, q8, #2 \n" + "vmla.f32 q14, q13, %e[kr3][1] \n" + "vext.32 q13, q7, q8, #3 \n" + "vmla.f32 q14, q13, %f[kr3][0] \n" + "vmla.f32 q14, q8, %f[kr3][1] \n" + + "vmla.f32 q14, q9, %e[ker1][0] \n" + "vext.32 q13, q9, q10, #1 \n" + "vmla.f32 q14, q13, %e[kr4][0] \n" + "vext.32 q13, q9, q10, #2 \n" + "vmla.f32 q14, q13, %e[kr4][1] \n" + "vext.32 q13, q9, q10, #3 \n" + "vmla.f32 q14, q13, %f[kr4][0] \n" + "vmla.f32 q14, q10, %f[kr4][1] \n" + + // restore output + "vst1.32 {q14}, [%[output_ptr0]]! 
\n" + "subs %[loop], #1 \n" + "bne loop_1h4w_%= \n" + + "start_remain_%=: \n" + "cmp %[remain], #0 \n" + "ble end_%= \n" + "mov r0, %[remain], lsl #2 \n" + "vld1.32 {d14-d17}, [%[input_ptr0]], r0 \n" + "vld1.32 {d18-d21}, [%[input_ptr1]], r0 \n" + "vld1.32 {d22-d25}, [%[input_ptr2]], r0 \n" + "vmul.f32 q14, q7, %e[ker0][0] \n" + "vext.32 q13, q7, q8, #1 \n" + "vmla.f32 q14, q13, %e[kr0][0] \n" + "vext.32 q13, q7, q8, #2 \n" + "vmla.f32 q14, q13, %e[kr0][1] \n" + "vext.32 q13, q7, q8, #3 \n" + "vmla.f32 q14, q13, %f[kr0][0] \n" + "vmla.f32 q14, q8, %f[kr0][1] \n" + + "vmla.f32 q14, q9, %e[ker0][1] \n" + "vext.32 q13, q9, q10, #1 \n" + "vmla.f32 q14, q13, %e[kr1][0] \n" + "vext.32 q13, q9, q10, #2 \n" + "vmla.f32 q14, q13, %e[kr1][1] \n" + "vext.32 q13, q9, q10, #3 \n" + "vmla.f32 q14, q13, %f[kr1][0] \n" + "vmla.f32 q14, q10, %f[kr1][1] \n" + + "vmla.f32 q14, q11, %f[ker0][0] \n" + "vext.32 q13, q11, q12, #1 \n" + "vmla.f32 q14, q13, %e[kr2][0] \n" + "vext.32 q13, q11, q12, #2 \n" + "vmla.f32 q14, q13, %e[kr2][1] \n" + "vext.32 q13, q11, q12, #3 \n" + "vmla.f32 q14, q13, %f[kr2][0] \n" + "vmla.f32 q14, q12, %f[kr2][1] \n" + + "vld1.32 {d14-d17}, [%[input_ptr3]], r0 \n" + "vld1.32 {d18-d21}, [%[input_ptr4]], r0 \n" + "vmla.f32 q14, q7, %f[ker0][1] \n" + "vext.32 q13, q7, q8, #1 \n" + "vmla.f32 q14, q13, %e[kr3][0] \n" + "vext.32 q13, q7, q8, #2 \n" + "vmla.f32 q14, q13, %e[kr3][1] \n" + "vext.32 q13, q7, q8, #3 \n" + "vmla.f32 q14, q13, %f[kr3][0] \n" + "vmla.f32 q14, q8, %f[kr3][1] \n" + + "vmla.f32 q14, q9, %e[ker1][0] \n" + "vext.32 q13, q9, q10, #1 \n" + "vmla.f32 q14, q13, %e[kr4][0] \n" + "vext.32 q13, q9, q10, #2 \n" + "vmla.f32 q14, q13, %e[kr4][1] \n" + "vext.32 q13, q9, q10, #3 \n" + "vmla.f32 q14, q13, %f[kr4][0] \n" + "vmla.f32 q14, q10, %f[kr4][1] \n" + + "cmp %[remain], #2 \n" + "blt store_1h1w_%= \n" + "vst1.32 {d28}, [%[output_ptr0]]! \n" + "cmp %[remain], #3 \n" + "blt end_%= \n" + "vst1.32 {d29[0]}, [%[output_ptr0]]! \n" + "b end_%= \n" + + "store_1h1w_%=: \n" + "vst1.32 {d28[0]}, [%[output_ptr0]]! 
\n" + "end_%=: \n" + : [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), + [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3), + [input_ptr4] "+r"(input_ptr4), [output_ptr0] "+r"(output_ptr0), + [loop] "+r"(loop) + : [remain] "r"(output_w_remain), [kr0] "w"(_ker[0]), + [kr1] "w"(_ker[1]), [kr2] "w"(_ker[2]), [kr3] "w"(_ker[3]), + [kr4] "w"(_ker[4]), [ker0] "w"(_ker[5]), [ker1] "w"(_ker[6]) + : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "r0"); + // pad right + if (padding_w) { + float32x4_t row0 = vld1q_f32(input_ptr0); + float32x4_t row1 = vld1q_f32(input_ptr1); + float32x4_t row2 = vld1q_f32(input_ptr2); + float32x4_t row3 = vld1q_f32(input_ptr3); + float32x4_t row4 = vld1q_f32(input_ptr4); + float32x4_t zero = vdupq_n_f32(0.f); + for (int w = valid_w_end; w < output_w; ++w) { + int padding = w + 5 - (padding_w + input_w); + if (padding >= 5) { + *output_ptr0 = 0.f; + } else { + int iw = w - valid_w_end; + float sum0 = input_ptr0[iw] * filter_ptr0[0] + + input_ptr1[iw] * filter_ptr1[0] + + input_ptr2[iw] * filter_ptr2[0] + + input_ptr3[iw] * filter_ptr3[0] + + input_ptr4[iw] * filter_ptr4[0]; + row0 = vextq_f32(row0, zero, 1); + row1 = vextq_f32(row1, zero, 1); + row2 = vextq_f32(row2, zero, 1); + row3 = vextq_f32(row3, zero, 1); + row4 = vextq_f32(row4, zero, 1); + row0 = vmulq_f32(row0, _ker[0]); + row0 = vmlaq_f32(row0, row1, _ker[1]); + row0 = vmlaq_f32(row0, row2, _ker[2]); + row0 = vmlaq_f32(row0, row3, _ker[3]); + row0 = vmlaq_f32(row0, row4, _ker[4]); + float32x2_t sum = + vpadd_f32(vget_low_f32(row0), vget_high_f32(row0)); + sum = vpadd_f32(sum, sum); + sum0 += vget_lane_f32(sum, 0); + *output_ptr0 = sum0; + } + output_ptr0++; + } + } + } + // pad bottom + for (int h = valid_h_end; h < output_h; ++h) { + DepthwiseConv5x5NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, + input_w, padding_h, padding_w, output_w, + output_ptr, _ker, _ker1); + } + } +} + +template <> +void DepthwiseConv5x5S2(const framework::Tensor &input, + const framework::Tensor &filter, + const std::vector &paddings, + framework::Tensor *output) {} + +} // namespace math +} // namespace operators +} // namespace paddle_mobile + +#endif // __ARM_NEON__ diff --git a/src/operators/math/depthwise_conv5x5.h b/src/operators/math/depthwise_conv5x5.h new file mode 100644 index 0000000000..d047bbfa1a --- /dev/null +++ b/src/operators/math/depthwise_conv5x5.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "framework/tensor.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { +namespace math { + +// TODO(hjchen2) need to be implemented +// template +// void DepthwiseConv5x5(const framework::Tensor *input, +// const framework::Tensor *filter, +// const std::vector &strides, +// const std::vector &paddings, +// framework::Tensor *output); + +template +void DepthwiseConv5x5S1(const framework::Tensor &input, + const framework::Tensor &filter, + const std::vector &paddings, + framework::Tensor *output); + +template +void DepthwiseConv5x5S2(const framework::Tensor &input, + const framework::Tensor &filter, + const std::vector &paddings, + framework::Tensor *output); + +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 643bd65ee0..2d8d7077d1 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -424,6 +424,8 @@ class ConvParam : public OpParam { EXEC_DEPTHWISE3x3_FLOAT, EXEC_WINOGRAD3X3_FLOAT, EXEC_WINOGRAD5X5_FLOAT, + EXEC_DEPTHWISE5x5S1_FLOAT, + EXEC_DEPTHWISE5x5S2_FLOAT, EXEC_GEMM_INT8, EXEC_DEPTHWISE3x3_INT8, }; @@ -2598,8 +2600,8 @@ class QuantizeParam : public OpParam { // if offine scale or not bool offline_ = false; // round method type - // RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO; - RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO; + RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO; + // RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO; }; #endif -- GitLab
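
Usage sketch (not part of the patch): the new PaddleMobile::Load(const PaddleMobileConfig &) overload chooses between a directory-style model (model_dir) and a combined model (prog_file + param_file), returning PMNotInitialized when neither is set. A minimal sketch of calling it, assuming the paddle_mobile namespace, CPU/float template arguments, and hypothetical model paths (none of which are spelled out in this patch):

    #include "io/paddle_mobile.h"

    int main() {
      paddle_mobile::PaddleMobileConfig config;
      // Combined model: program and parameters in two files (paths are hypothetical).
      config.prog_file = "./mobilenet/model";
      config.param_file = "./mobilenet/params";
      // Alternatively, set config.model_dir for a non-combined model directory.
      config.optimize = true;
      config.quantification = false;
      config.batch_size = 1;
      config.lod_mode = false;

      // Device/precision template arguments are assumed; the patch only shows
      // the generic PaddleMobile<Device, T> interface.
      paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> engine;
      if (engine.Load(config) != paddle_mobile::PMSuccess) {
        return -1;  // Load() fails if neither model_dir nor prog_file/param_file is set.
      }
      // Fill an input tensor and call engine.Predict(...) as before.
      return 0;
    }

Reference semantics of the new kernel (not part of the patch): DepthwiseConv5x5S1, dispatched through EXEC_DEPTHWISE5x5S1_FLOAT, is expected to match, per channel, a plain stride-1 5x5 depthwise convolution with zero padding; the NEON intrinsics and inline assembly above only accelerate that computation. A scalar sketch for one channel, with an illustrative function name that does not exist in the patch:

    // One channel of a stride-1, zero-padded 5x5 depthwise convolution.
    // output_h/output_w are assumed consistent with the padding, i.e.
    // output_h = input_h + 2 * padding_h - 4 (and likewise for width).
    void DepthwiseConv5x5S1Reference(const float *input, const float *filter,
                                     int input_h, int input_w, int padding_h,
                                     int padding_w, int output_h, int output_w,
                                     float *output) {
      for (int oh = 0; oh < output_h; ++oh) {
        for (int ow = 0; ow < output_w; ++ow) {
          float value = 0.f;
          for (int kh = 0; kh < 5; ++kh) {
            for (int kw = 0; kw < 5; ++kw) {
              int ih = oh - padding_h + kh;
              int iw = ow - padding_w + kw;
              // Positions outside the input contribute zero (zero padding).
              if (ih >= 0 && ih < input_h && iw >= 0 && iw < input_w) {
                value += filter[kh * 5 + kw] * input[ih * input_w + iw];
              }
            }
          }
          output[oh * output_w + ow] = value;
        }
      }
    }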