From 5802b1f8d58c0bd9dcc9257edf3601d8ba9559b6 Mon Sep 17 00:00:00 2001
From: yangruoqi713
Date: Wed, 26 Aug 2020 18:29:09 +0800
Subject: [PATCH] [MS][LITE] arm cpu fp16 op: optimize conv depthwise

---
 .../lite/nnacl/assembly/opt/ConvDwFp16Row.S   | 117 ++++++++++
 mindspore/lite/nnacl/fp16/common_func.h       |  49 -----
 .../lite/nnacl/fp16/conv_depthwise_fp16.c     |  58 ++++-
 .../lite/nnacl/fp16/conv_depthwise_fp16.h     |  20 ++
 mindspore/lite/nnacl/fp16/pack_fp16.c         |  13 ++
 mindspore/lite/nnacl/fp16/pack_fp16.h         |   2 +
 .../arm/fp16/convolution_depthwise_fp16.cc    |  99 ++------
 .../arm/fp16/convolution_depthwise_fp16.h     |  12 +-
 .../convolution_depthwise_slidewindow_fp16.cc | 203 ++++++++++++++++++
 .../convolution_depthwise_slidewindow_fp16.h  |  62 ++++++
 .../arm/fp16/convolution_winograd_fp16.cc     |   1 -
 11 files changed, 496 insertions(+), 140 deletions(-)
 create mode 100644 mindspore/lite/nnacl/assembly/opt/ConvDwFp16Row.S
 delete mode 100644 mindspore/lite/nnacl/fp16/common_func.h
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h

diff --git a/mindspore/lite/nnacl/assembly/opt/ConvDwFp16Row.S b/mindspore/lite/nnacl/assembly/opt/ConvDwFp16Row.S
new file mode 100644
index 000000000..6cc0a2cf4
--- /dev/null
+++ b/mindspore/lite/nnacl/assembly/opt/ConvDwFp16Row.S
@@ -0,0 +1,117 @@
+#ifdef __aarch64__
+
+.text
+.align 5
+.global ConvDwFp16Row
+#ifndef __APPLE__
+.type ConvDwFp16Row, %function
+#endif
+
+// void ConvDwFp16Row(float16_t* output_ptr, const float16_t* input_ptr, const float16_t* filter_ptr,
+//                    size_t num_pixels, size_t input_channel, size_t input_step)
+// x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels,
+// x4: input_channel, x5: input_step
+//
+ConvDwFp16Row:
+    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
+    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
+    // x19 ~ x29 should also be preserved; this routine stays within v0 ~ v19 and
+    // x0 ~ x12, so nothing needs to be saved, whereas our coding style does not
+    // permit that many parameters anyway
+cmp x3, #0
+beq End
+
+mov x9, x0
+mov x12, #2  // sizeof(float16_t)
+mul x5, x5, x12
+
+LoopOutPixel:
+mov x6, x1
+mov x7, x2
+mov x8, x4
+
+    LoopInputDepth32In:
+    cmp x8, #32
+    blt Loop8
+    sub x8, x8, #32
+
+    ld1 {v0.8h, v1.8h}, [x6], #32
+    ld1 {v2.8h, v3.8h}, [x7], #32
+    ld1 {v16.8h, v17.8h}, [x0], #32
+
+    cmp x8, #32
+    blt LoopInputDepth32Out
+    LoopInputDepth32:
+        fmla v16.8h, v0.8h, v2.8h
+        fmla v17.8h, v1.8h, v3.8h
+
+        st1 {v16.8h, v17.8h}, [x9], #32
+
+        ld1 {v4.8h, v5.8h}, [x6], #32
+        ld1 {v6.8h, v7.8h}, [x7], #32
+        ld1 {v18.8h, v19.8h}, [x0], #32
+
+        fmla v18.8h, v4.8h, v6.8h
+        fmla v19.8h, v5.8h, v7.8h
+
+        st1 {v18.8h, v19.8h}, [x9], #32
+
+        ld1 {v0.8h, v1.8h}, [x6], #32
+        ld1 {v2.8h, v3.8h}, [x7], #32
+        ld1 {v16.8h, v17.8h}, [x0], #32
+
+        sub x8, x8, #32
+        cmp x8, #32
+        bge LoopInputDepth32
+
+    LoopInputDepth32Out:
+        fmla v16.8h, v0.8h, v2.8h
+        fmla v17.8h, v1.8h, v3.8h
+        st1 {v16.8h, v17.8h}, [x9], #32
+
+        ld1 {v4.8h, v5.8h}, [x6], #32
+        ld1 {v6.8h, v7.8h}, [x7], #32
+        ld1 {v18.8h, v19.8h}, [x0], #32
+
+        fmla v18.8h, v4.8h, v6.8h
+        fmla v19.8h, v5.8h, v7.8h
+
+        st1 {v18.8h, v19.8h}, [x9], #32
+
+    Loop8:
+        cmp x8, #8
+        blt L0
+
+        LoopInputDepth8:
+            ld1 {v0.8h}, [x6], #16
+            ld1 {v2.8h}, [x7], #16
+            ld1 {v16.8h}, [x0], #16
+            fmla v16.8h, v0.8h, v2.8h
+            st1 {v16.8h}, [x9], #16
+            sub x8, x8, #8
+            cmp x8, #8
+            bge LoopInputDepth8
+
+    L0:
+        cmp x8, #0
+        beq Loop8LineEnd
+
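+        // Scalar tail: fewer than 8 channels remain, so finish them one fp16
+        // lane at a time: out[c] = out[c] + in[c] * w[c].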
+        LoopInputDepth0:
+            ldr h0, [x6], #2
+            ldr h1, [x7], #2
+            ldr h2, [x0], #2
+            fmul h0, h0, h1
+            fadd h2, h2, h0
+            str h2, [x9], #2
+            subs x8, x8, #1
+            bne LoopInputDepth0
+
+        Loop8LineEnd:
+
+subs x3, x3, #1
+add x1, x1, x5
+bne LoopOutPixel
+
+End:
+ret
+
+#endif
diff --git a/mindspore/lite/nnacl/fp16/common_func.h b/mindspore/lite/nnacl/fp16/common_func.h
deleted file mode 100644
index 54e356c7b..000000000
--- a/mindspore/lite/nnacl/fp16/common_func.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_NNACL_FP16_COMMON_FUNC_H_
-#define MINDSPORE_LITE_NNACL_FP16_COMMON_FUNC_H_
-
-#include 
-#include 
-#include 
-#include "nnacl/op_base.h"
-#include "nnacl/conv_parameter.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef ENABLE_ARM64
-void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
-                      size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
-                      size_t relu6);
-void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
-                      size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
-                      size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
-                      size_t relu, size_t relu6);
-void DeconvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
-                        size_t in_kh_step, size_t in_kw_step, size_t kernel_w);
-void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
-                        size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
-                        size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* MINDSPORE_LITE_NNACL_FP32_COMMON_FUNC_H_ */
diff --git a/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c b/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c
index 49777ad01..35dfe0f46 100644
--- a/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c
+++ b/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c
@@ -15,8 +15,62 @@
  */
 
 #include "nnacl/fp16/conv_depthwise_fp16.h"
-#include 
-#include "nnacl/fp16/common_func.h"
+#include <string.h>
+#include "nnacl/fp16/activation_fp16.h"
+
+void ConvDwFp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
+                const float16_t *bias_data, const ConvParameter *conv_param, int task_id) {
+  int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
+  int h_start = h_step * task_id;
+  int h_end = MSMIN(h_start + h_step, conv_param->output_h_);
+  bool relu = conv_param->act_type_ == ActType_Relu;
+  bool relu6 = conv_param->act_type_ == ActType_Relu6;
+  for (int b = 0; b < conv_param->output_batch_; b++) {
+    const float16_t *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
+    float16_t *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_;
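+    // This task covers output rows [h_start, h_end). For each row the bias is
+    // broadcast into the output first, then every (kh, kw) filter tap is
+    // accumulated over the valid output-width range by ConvDwFp16Row.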
+    for (int oh = h_start; oh < h_end; oh++) {
+      float16_t *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_;
+
+      int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_;
+      int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_));
+      int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));
+
+      for (int ow = 0; ow < conv_param->output_w_; ow++) {
+        memcpy(dst_data + ow * conv_param->output_channel_, bias_data, conv_param->output_channel_ * sizeof(float16_t));
+      }
+      for (int kh = start_kh; kh < end_kh; kh++) {
+        int ih = ih_origin + conv_param->dilation_h_ * kh;
+
+        const float16_t *src_kh = src + ih * conv_param->input_w_ * conv_param->input_channel_;
+        const float16_t *weight_kh = weight_data + kh * conv_param->kernel_w_ * conv_param->output_channel_;
+
+        int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_;
+        for (int kw = 0; kw < conv_param->kernel_w_; kw++) {
+          int out_w_start = MSMAX(
+            0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
+          int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ -
+                                                        conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) /
+                                                         conv_param->stride_w_);
+
+          float16_t *dst_w = dst_data + out_w_start * conv_param->output_channel_;
+          int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw;
+
+          const float16_t *src_kw = src_kh + iw_origin * conv_param->input_channel_;
+          int num_pixels = out_w_end - out_w_start;
+
+          ConvDwFp16Row(dst_w, src_kw, weight_kh, num_pixels, conv_param->output_channel_, in_sw_step);
+          weight_kh += conv_param->output_channel_;
+        }
+      }
+      if (relu) {
+        ReluFp16(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_);
+      }
+      if (relu6) {
+        Relu6Fp16(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_);
+      }
+    }
+  }
+}
 
 /*conv depthwise fp16 begin*/
 void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
diff --git a/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.h b/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.h
index e70ebcfaf..b1ae0a12d 100644
--- a/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.h
+++ b/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.h
@@ -23,6 +23,26 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
+#ifdef ENABLE_ARM64
+void ConvDwFp16Row(float16_t *output_ptr, const float16_t *input_ptr, const float16_t *filter_ptr, size_t num_pixels,
+                   size_t input_channel, size_t input_step);
+void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
+                      size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
+                      size_t relu6);
+void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
+                      size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
+                      size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
+                      size_t relu, size_t relu6);
+void DeconvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
+                        size_t in_kh_step, size_t in_kw_step, size_t kernel_w);
+void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
+                        size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
+                        size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
+#endif
+
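+// Row-wise fp16 depthwise convolution on plain NHWC data (no channel
+// re-packing); work is split across threads by output row via task_id.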
+void ConvDwFp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
+                const float16_t *bias_data, const ConvParameter *conv_param, int task_id);
+
 void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                   const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
                   int task_id);
diff --git a/mindspore/lite/nnacl/fp16/pack_fp16.c b/mindspore/lite/nnacl/fp16/pack_fp16.c
index b5ca307da..ab26faed6 100644
--- a/mindspore/lite/nnacl/fp16/pack_fp16.c
+++ b/mindspore/lite/nnacl/fp16/pack_fp16.c
@@ -220,6 +220,19 @@ void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
   }
 }
 
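+// Converts an fp16 tensor from NCHW to plain (unaligned) NHWC layout; used
+// below to lay the depthwise weights out channel-contiguously for ConvDwFp16Row.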
+void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel) {
+  for (int n = 0; n < batch; n++) {
+    for (int c = 0; c < channel; c++) {
+      for (int hw = 0; hw < plane; hw++) {
+        int nhwc_index = n * channel * plane + hw * channel + c;
+        int nchw_index = n * channel * plane + c * plane + hw;
+        ((float16_t *)(dst))[nhwc_index] = ((const float16_t *)(src))[nchw_index];
+      }
+    }
+  }
+  return;
+}
+
 void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel) {
   int ic4 = UP_DIV(channel, C4NUM);
   int nhwc4_batch_unit_offset = ic4 * C4NUM * plane;
diff --git a/mindspore/lite/nnacl/fp16/pack_fp16.h b/mindspore/lite/nnacl/fp16/pack_fp16.h
index 493672b05..afc55f244 100644
--- a/mindspore/lite/nnacl/fp16/pack_fp16.h
+++ b/mindspore/lite/nnacl/fp16/pack_fp16.h
@@ -41,6 +41,8 @@ void PackNHWCToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
 
 void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int channel);
 
+void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel);
+
 void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel);
 
 void PackNHWCToNHWC8Fp16(const void *src, void *dst, int batch, int plane, int channel);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
index cf88b2be2..1f66d0bd1 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@@ -15,6 +15,7 @@
  */
 
 #include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h"
+#include "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h"
 #include "nnacl/fp16/pack_fp16.h"
 #include "nnacl/fp16/cast_fp16.h"
 #include "schema/model_generated.h"
@@ -30,72 +31,34 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 
 namespace mindspore::kernel {
 ConvolutionDepthwiseFp16CPUKernel::~ConvolutionDepthwiseFp16CPUKernel() {
-  if (sliding_ != nullptr) {
-    delete sliding_;
-    sliding_ = nullptr;
-  }
   if (packed_weight_ != nullptr) {
     delete packed_weight_;
     packed_weight_ = nullptr;
   }
-  FreeTmpBuffer();
-}
-
-void ConvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() {
-  if (need_align_) {
-    if (packed_input_ != nullptr) {
-      delete packed_input_;
-      packed_input_ = nullptr;
-    }
-    if (packed_output_ != nullptr) {
-      delete packed_output_;
-      packed_output_ = nullptr;
-    }
-  }
-}
-
-int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() {
-  if (conv_param_->input_channel_ % C4NUM != 0) {
-    need_align_ = true;
-    int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
-    int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
-    packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
-    if (packed_input_ == nullptr) {
-      MS_LOG(ERROR) << "Malloc buffer failed.";
-      return RET_ERROR;
-    }
-
-    int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
-    packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
-    if (packed_output_ == nullptr) {
-      MS_LOG(ERROR) << "Malloc buffer failed.";
-      return RET_ERROR;
-    }
-  }
-  return RET_OK;
-}
 
 int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
+  ConvolutionBaseFP16CPUKernel::GetExecuteFilter();
+
   auto weight_tensor = in_tensors_[kWeightIndex];
-  int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
-  auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
-  int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
+  int channel = weight_tensor->Batch();
+  int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width();
 
   packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
-                           weight_tensor->Batch());
+  PackNCHWToNHWCFp16(fp16_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+                     weight_tensor->Batch());
 
-  bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
+  bias_data_ = reinterpret_cast<float16_t *>(malloc(channel * sizeof(float16_t)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
+  memset(bias_data_, 0, channel * sizeof(float16_t));
   auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
   if (in_tensors_.size() == kInputSize2) {
     auto bias_tensor = in_tensors_.at(kBiasIndex);
@@ -104,18 +67,10 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
       bias_fp16[i] = (float16_t)ori_bias[i];
     }
   }
-
-  conv_param_->thread_num_ = MSMIN(thread_count_, OC8);
   return RET_OK;
 }
 
 int ConvolutionDepthwiseFp16CPUKernel::Init() {
-  sliding_ = new (std::nothrow) SlidingWindowParam;
-  if (sliding_ == nullptr) {
-    MS_LOG(ERROR) << "new sliding window param failed.";
-    return RET_ERROR;
-  }
-
   auto ret = InitWeightBias();
   if (ret != 0) {
     MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed.";
@@ -129,24 +84,17 @@ int ConvolutionDepthwiseFp16CPUKernel::Init() {
 }
 
 int ConvolutionDepthwiseFp16CPUKernel::ReSize() {
-  FreeTmpBuffer();
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     return ret;
   }
-  InitSlidingParamConvDw(sliding_, conv_param_, C8NUM);
-
-  ret = InitBuffer();
-  if (ret != 0) {
-    MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed.";
-    return RET_ERROR;
-  }
+  conv_param_->thread_num_ = MSMIN(thread_count_, conv_param_->output_h_);
   return RET_OK;
 }
 
 int ConvolutionDepthwiseFp16CPUKernel::Execute(int task_id) {
-  ConvDwC8Fp16(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_,
-               sliding_, task_id);
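+  // Pack-free path: consume and produce the NHWC execute tensors directly.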
+  ConvDwFp16(execute_output_, execute_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_,
+             task_id);
   return RET_OK;
 }
 
@@ -176,25 +124,13 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "Get Execute tensor failed.";
     return ret;
   }
-  if (need_align_) {
-    PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
-                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
-  } else {
-    packed_input_ = execute_input_;
-  }
-  if (!need_align_) {
-    packed_output_ = execute_output_;
-  }
 
   ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";
     return RET_ERROR;
   }
-  if (need_align_) {
-    PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
-                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
-  }
+
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 
@@ -207,7 +143,14 @@ kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+  auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
+  kernel::LiteKernel *kernel;
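+  // Dispatch: shapes with fewer than 32 channels go through the C8
+  // sliding-window kernel; wider shapes use the pack-free row kernel.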
+  if (conv_param->input_channel_ < 32) {
+    kernel =
+      new (std::nothrow) kernel::ConvolutionDepthwiseSWFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  } else {
+    kernel = new (std::nothrow) kernel::ConvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  }
   if (kernel == nullptr) {
     MS_LOG(ERROR) << "kernel is nullptr.";
     return nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
index 2355fe819..ff0a3d031 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
@@ -25,14 +25,12 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
-                  const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
-                  int task_id);
+void ConvDwFp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
+                const float16_t *bias_data, const ConvParameter *conv_param, int task_id);
 #ifdef __cplusplus
 }
 #endif
-
 namespace mindspore::kernel {
 class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
  public:
@@ -46,17 +44,11 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   int ReSize() override;
   int Run() override;
 
-  int InitBuffer();
   int InitWeightBias();
   int Execute(int task_id);
 
  private:
-  void FreeTmpBuffer();
-  SlidingWindowParam *sliding_ = nullptr;
   float16_t *packed_weight_ = nullptr;
-  float16_t *packed_input_ = nullptr;
-  float16_t *packed_output_ = nullptr;
-  bool need_align_ = false;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
new file mode 100644
index 000000000..39f2eb7bf
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
@@ -0,0 +1,203 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h"
+#include "nnacl/fp16/pack_fp16.h"
+#include "nnacl/fp16/cast_fp16.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "include/errorcode.h"
+#include "src/runtime/runtime_api.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_DepthwiseConv2D;
+
+namespace mindspore::kernel {
+ConvolutionDepthwiseSWFp16CPUKernel::~ConvolutionDepthwiseSWFp16CPUKernel() {
+  if (sliding_ != nullptr) {
+    delete sliding_;
+    sliding_ = nullptr;
+  }
+  if (packed_weight_ != nullptr) {
+    delete packed_weight_;
+    packed_weight_ = nullptr;
+  }
+  FreeTmpBuffer();
+}
+
+void ConvolutionDepthwiseSWFp16CPUKernel::FreeTmpBuffer() {
+  if (need_align_) {
+    if (packed_input_ != nullptr) {
+      delete packed_input_;
+      packed_input_ = nullptr;
+    }
+    if (packed_output_ != nullptr) {
+      delete packed_output_;
+      packed_output_ = nullptr;
+    }
+  }
+}
+
+int ConvolutionDepthwiseSWFp16CPUKernel::InitBuffer() {
+  if (conv_param_->input_channel_ % C4NUM != 0) {
+    need_align_ = true;
+    int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
+    int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
+    packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
+    if (packed_input_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
+
+    int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
+    packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
+    if (packed_output_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
+  }
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() {
+  // init weight: o, h, w, i; o == group, i == 1
+  auto weight_tensor = in_tensors_[kWeightIndex];
+  int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
+  auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
+  int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
+
+  packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
+  if (packed_weight_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+                           weight_tensor->Batch());
+
+  bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
+  if (bias_data_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
+  auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
+  if (in_tensors_.size() == kInputSize2) {
+    auto bias_tensor = in_tensors_.at(kBiasIndex);
+    auto ori_bias = reinterpret_cast<float *>(bias_tensor->Data());
+    for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
+      bias_fp16[i] = (float16_t)ori_bias[i];
+    }
+  }
+
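+  // Cap the thread count at the number of 8-channel output blocks so that
+  // every task owns at least one full C8 slice.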
+  conv_param_->thread_num_ = MSMIN(thread_count_, OC8);
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseSWFp16CPUKernel::Init() {
+  sliding_ = new (std::nothrow) SlidingWindowParam;
+  if (sliding_ == nullptr) {
+    MS_LOG(ERROR) << "new sliding window param failed.";
+    return RET_ERROR;
+  }
+
+  auto ret = InitWeightBias();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed.";
+    return RET_ERROR;
+  }
+
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  return ReSize();
+}
+
+int ConvolutionDepthwiseSWFp16CPUKernel::ReSize() {
+  FreeTmpBuffer();
+  auto ret = ConvolutionBaseCPUKernel::Init();
+  if (ret != RET_OK) {
+    return ret;
+  }
+  InitSlidingParamConvDw(sliding_, conv_param_, C8NUM);
+
+  ret = InitBuffer();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseSWFp16CPUKernel::Execute(int task_id) {
+  ConvDwC8Fp16(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_,
+               sliding_, task_id);
+  return RET_OK;
+}
+
+static int ConvDwSWFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+  auto conv_dw_fp16 = reinterpret_cast<ConvolutionDepthwiseSWFp16CPUKernel *>(cdata);
+  auto ret = conv_dw_fp16->Execute(task_id);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "ConvolutionDepthwiseSWFp16Run error task_id[" << task_id << "] error_code[" << ret << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
+  auto ret = Prepare();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare failed.";
+    return RET_ERROR;
+  }
+  if (conv_param_->input_channel_ != conv_param_->output_channel_) {
+    MS_LOG(ERROR) << "Only support input channel equals output channel.";
+    return RET_ERROR;
+  }
+
+  ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Get Execute tensor failed.";
+    return ret;
+  }
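+  // Repack NHWC activations into C8-aligned NHWC8 buffers only when the channel
+  // count is not already 4-aligned; otherwise run in place on the execute tensors.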
+  if (need_align_) {
+    PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
+                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+  } else {
+    packed_input_ = execute_input_;
+  }
+  if (!need_align_) {
+    packed_output_ = execute_output_;
+  }
+
+  ret = LiteBackendParallelLaunch(ConvDwSWFp16Run, this, conv_param_->thread_num_);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "ConvDwSWFp16Run error: error_code[" << ret << "]";
+    return RET_ERROR;
+  }
+  if (need_align_) {
+    PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
+                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  }
+  ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
+  return RET_OK;
+}
+
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h
new file mode 100644
index 000000000..dce8aeb46
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h
@@ -0,0 +1,62 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_SW_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_SW_FP16_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
+#include "nnacl/fp16/conv_depthwise_fp16.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
+                  const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
+                  int task_id);
+#ifdef __cplusplus
+}
+#endif
+
+namespace mindspore::kernel {
+class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+ public:
+  ConvolutionDepthwiseSWFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
+                                      const mindspore::lite::PrimitiveC *primitive)
+      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~ConvolutionDepthwiseSWFp16CPUKernel() override;
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+  int InitBuffer();
+  int InitWeightBias();
+  int Execute(int task_id);
+
+ private:
+  void FreeTmpBuffer();
+  SlidingWindowParam *sliding_ = nullptr;
+  float16_t *packed_weight_ = nullptr;
+  float16_t *packed_input_ = nullptr;
+  float16_t *packed_output_ = nullptr;
+  bool need_align_ = false;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_SW_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
index ff547040f..cb7526f82 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@@ -17,7 +17,6 @@
 #include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h"
 #include "src/runtime/kernel/arm/fp16/matrix_fp16.h"
 #include "nnacl/fp16/conv_fp16.h"
-#include "nnacl/fp16/common_func.h"
 #include "nnacl/fp16/cast_fp16.h"
 #include "nnacl/fp16/pack_fp16.h"
 #include "nnacl/fp16/winograd_transform_fp16.h"
--
GitLab