Commit 5802b1f8 authored by yangruoqi713

[MS][LITE] arm cpu fp16 op: optimize conv depthwise

Parent 8b5c3521
#ifdef __aarch64__
.text
.align 5
.global ConvDwFp16Row
#ifndef __APPLE__
.type ConvDwFp16Row, %function
#endif
// void ConvDwFp16Row(float16_t* output_ptr, const float16_t* input_ptr, const float16_t* filter_ptr,
//                    size_t num_pixels, size_t input_channel, size_t input_step)
// x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels,
// x4: input_channel, x5: input_step
//
ConvDwFp16Row:
    // Registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should also be preserved,
    // whereas our coding style does not permit such a number of parameters.
    cmp x3, #0
    beq End

    mov x9, x0
    mov x12, #2  // sizeof(float16_t)
    mul x5, x5, x12
LoopOutPixel:
    mov x6, x1
    mov x7, x2
    mov x8, x4

    LoopInputDepth32In:
        cmp x8, #32
        blt Loop8
        sub x8, x8, #32

        ld1 {v0.8h, v1.8h}, [x6], #32
        ld1 {v2.8h, v3.8h}, [x7], #32
        ld1 {v16.8h, v17.8h}, [x0], #32

        cmp x8, #32
        blt LoopInputDepth32Out

    LoopInputDepth32:
        fmla v16.8h, v0.8h, v2.8h
        fmla v17.8h, v1.8h, v3.8h

        st1 {v16.8h, v17.8h}, [x9], #32

        ld1 {v4.8h, v5.8h}, [x6], #32
        ld1 {v6.8h, v7.8h}, [x7], #32
        ld1 {v18.8h, v19.8h}, [x0], #32

        fmla v18.8h, v4.8h, v6.8h
        fmla v19.8h, v5.8h, v7.8h

        st1 {v18.8h, v19.8h}, [x9], #32

        ld1 {v0.8h, v1.8h}, [x6], #32
        ld1 {v2.8h, v3.8h}, [x7], #32
        ld1 {v16.8h, v17.8h}, [x0], #32

        sub x8, x8, #32
        cmp x8, #32
        bge LoopInputDepth32

    LoopInputDepth32Out:
        fmla v16.8h, v0.8h, v2.8h
        fmla v17.8h, v1.8h, v3.8h
        st1 {v16.8h, v17.8h}, [x9], #32

        ld1 {v4.8h, v5.8h}, [x6], #32
        ld1 {v6.8h, v7.8h}, [x7], #32
        ld1 {v18.8h, v19.8h}, [x0], #32

        fmla v18.8h, v4.8h, v6.8h
        fmla v19.8h, v5.8h, v7.8h

        st1 {v18.8h, v19.8h}, [x9], #32

    Loop8:
        cmp x8, #8
        blt L0

    LoopInputDepth8:
        ld1 {v0.8h}, [x6], #16
        ld1 {v2.8h}, [x7], #16
        ld1 {v16.8h}, [x0], #16
        fmla v16.8h, v0.8h, v2.8h
        st1 {v16.8h}, [x9], #16
        sub x8, x8, #8
        cmp x8, #8
        bge LoopInputDepth8

    L0:
        cmp x8, #0
        beq Loop8LineEnd

    LoopInputDepth0:
        ldr h0, [x6], #2
        ldr h1, [x7], #2
        ldr h2, [x0], #2
        fmul h0, h0, h1
        fadd h2, h2, h0
        str h2, [x9], #2
        subs x8, x8, #1
        bne LoopInputDepth0

    Loop8LineEnd:

    subs x3, x3, #1
    add x1, x1, x5
    bne LoopOutPixel

End:
    ret
#endif
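For reference, a minimal C sketch of what the ConvDwFp16Row kernel above computes: for each of num_pixels output positions it accumulates an elementwise input * filter product over input_channel values, advancing the input by input_step elements per position while the filter stays fixed. This is an illustrative reference only, not the optimized path; float16_t is assumed to be provided by the ARM toolchain (e.g. via <arm_neon.h>), and ConvDwFp16RowRef is a hypothetical name.

```c
#include <arm_neon.h>  // float16_t on ARM toolchains (assumption for this sketch)
#include <stddef.h>

// Reference semantics of the assembly kernel above (illustrative only).
void ConvDwFp16RowRef(float16_t *output_ptr, const float16_t *input_ptr, const float16_t *filter_ptr,
                      size_t num_pixels, size_t input_channel, size_t input_step) {
  for (size_t p = 0; p < num_pixels; p++) {
    for (size_t c = 0; c < input_channel; c++) {
      // Accumulate one kernel tap into the bias-initialized output row.
      output_ptr[p * input_channel + c] += input_ptr[c] * filter_ptr[c];
    }
    input_ptr += input_step;  // the caller passes stride_w_ * input_channel_ here
  }
}
```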
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_NNACL_FP16_COMMON_FUNC_H_
#define MINDSPORE_LITE_NNACL_FP16_COMMON_FUNC_H_
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "nnacl/op_base.h"
#include "nnacl/conv_parameter.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef ENABLE_ARM64
void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
size_t relu6);
void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
size_t relu, size_t relu6);
void DeconvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
size_t in_kh_step, size_t in_kw_step, size_t kernel_w);
void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
#endif
#ifdef __cplusplus
}
#endif
#endif /* MINDSPORE_LITE_NNACL_FP16_COMMON_FUNC_H_ */
......@@ -15,8 +15,62 @@
*/
#include "nnacl/fp16/conv_depthwise_fp16.h"
#include <arm_neon.h>
#include "nnacl/fp16/common_func.h"
#include <string.h>
#include "nnacl/fp16/activation_fp16.h"
void ConvDwFp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                const float16_t *bias_data, const ConvParameter *conv_param, int task_id) {
  // Output rows are split evenly across threads; this task handles rows [h_start, h_end).
  int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
  int h_start = h_step * task_id;
  int h_end = MSMIN(h_start + h_step, conv_param->output_h_);
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  for (int b = 0; b < conv_param->output_batch_; b++) {
    const float16_t *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
    float16_t *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_;
    for (int oh = h_start; oh < h_end; oh++) {
      float16_t *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_;

      int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_;
      // Clamp the kernel-height range so that every tap stays inside the input.
      int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_));
      int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));

      // Initialize the whole output row with the bias.
      for (int ow = 0; ow < conv_param->output_w_; ow++) {
        memcpy(dst_data + ow * conv_param->output_channel_, bias_data,
               conv_param->output_channel_ * sizeof(float16_t));
      }
      for (int kh = start_kh; kh < end_kh; kh++) {
        int ih = ih_origin + conv_param->dilation_h_ * kh;

        const float16_t *src_kh = src + ih * conv_param->input_w_ * conv_param->input_channel_;
        const float16_t *weight_kh = weight_data + kh * conv_param->kernel_w_ * conv_param->output_channel_;

        int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_;
        for (int kw = 0; kw < conv_param->kernel_w_; kw++) {
          // Clamp the output-width range so that every tap stays inside the input row.
          int out_w_start = MSMAX(
            0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
          int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ -
                                                        conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) /
                                                         conv_param->stride_w_);

          float16_t *dst_w = dst_data + out_w_start * conv_param->output_channel_;
          int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw;

          const float16_t *src_kw = src_kh + iw_origin * conv_param->input_channel_;
          int num_pixels = out_w_end - out_w_start;

          // Accumulate one kernel tap over the whole valid output row.
          ConvDwFp16Row(dst_w, src_kw, weight_kh, num_pixels, conv_param->output_channel_, in_sw_step);
          weight_kh += conv_param->output_channel_;
        }
      }
      if (relu) {
        ReluFp16(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_);
      }
      if (relu6) {
        Relu6Fp16(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_);
      }
    }
  }
}
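A small, self-contained sketch of the per-task row split used in ConvDwFp16 above. UP_DIV and MSMIN are redefined locally so the example compiles on its own (in nnacl they come from op_base.h); the sizes are hypothetical.

```c
#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

// Print the output-row range each task would handle, mirroring the
// h_step / h_start / h_end computation in ConvDwFp16.
int main(void) {
  int output_h = 10, thread_num = 4;  // hypothetical sizes
  int h_step = UP_DIV(output_h, thread_num);
  for (int task_id = 0; task_id < thread_num; task_id++) {
    int h_start = h_step * task_id;
    int h_end = MSMIN(h_start + h_step, output_h);
    printf("task %d: rows [%d, %d)\n", task_id, h_start, h_end);  // [0,3) [3,6) [6,9) [9,10)
  }
  return 0;
}
```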
/*conv depthwise fp16 begin*/
void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
......
......@@ -23,6 +23,26 @@
#ifdef __cplusplus
extern "C" {
#endif
#ifdef ENABLE_ARM64
void ConvDwFp16Row(float16_t *output_ptr, const float16_t *input_ptr, const float16_t *filter_ptr, size_t num_pixels,
size_t input_channel, size_t input_step);
void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
size_t relu6);
void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
size_t relu, size_t relu6);
void DeconvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
size_t in_kh_step, size_t in_kw_step, size_t kernel_w);
void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
#endif
void ConvDwFp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
const float16_t *bias_data, const ConvParameter *conv_param, int task_id);
void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
int task_id);
......
......@@ -220,6 +220,19 @@ void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
}
}
void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel) {
  for (int n = 0; n < batch; n++) {
    for (int c = 0; c < channel; c++) {
      for (int hw = 0; hw < plane; hw++) {
        int nhwc_index = n * channel * plane + hw * channel + c;
        int nchw_index = n * channel * plane + c * plane + hw;
        ((float16_t *)dst)[nhwc_index] = ((const float16_t *)src)[nchw_index];
      }
    }
  }
}
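A minimal usage sketch of the new PackNCHWToNHWCFp16 routine (the pack_example wrapper and the tensor sizes are hypothetical; float16_t is assumed from the ARM toolchain, e.g. via <arm_neon.h>):

```c
#include <arm_neon.h>  // float16_t (assumption for this sketch)
#include "nnacl/fp16/pack_fp16.h"

void pack_example(void) {
  // NCHW source with batch = 1, channel = 3, plane (H*W) = 2:
  // channel 0 = {1, 2}, channel 1 = {3, 4}, channel 2 = {5, 6}
  float16_t src[6] = {1, 2, 3, 4, 5, 6};
  float16_t dst[6];
  PackNCHWToNHWCFp16(src, dst, /*batch=*/1, /*plane=*/2, /*channel=*/3);
  // dst is now {1, 3, 5, 2, 4, 6}: pixel-major, channel-minor (NHWC).
}
```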
void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel) {
int ic4 = UP_DIV(channel, C4NUM);
int nhwc4_batch_unit_offset = ic4 * C4NUM * plane;
......
......@@ -41,6 +41,8 @@ void PackNHWCToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int channel);
void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWC8Fp16(const void *src, void *dst, int batch, int plane, int channel);
......
......@@ -15,6 +15,7 @@
*/
#include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "schema/model_generated.h"
......@@ -30,72 +31,34 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace mindspore::kernel {
ConvolutionDepthwiseFp16CPUKernel::~ConvolutionDepthwiseFp16CPUKernel() {
if (sliding_ != nullptr) {
delete sliding_;
sliding_ = nullptr;
}
if (packed_weight_ != nullptr) {
delete packed_weight_;
packed_weight_ = nullptr;
}
FreeTmpBuffer();
}
void ConvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() {
if (need_align_) {
if (packed_input_ != nullptr) {
delete packed_input_;
packed_input_ = nullptr;
}
if (packed_output_ != nullptr) {
delete packed_output_;
packed_output_ = nullptr;
}
}
}
int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() {
if (conv_param_->input_channel_ % C4NUM != 0) {
need_align_ = true;
int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
if (packed_output_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
}
return RET_OK;
}
int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
// init weight: o, h, w, i; o == group, i == 1
ConvolutionBaseFP16CPUKernel::GetExecuteFilter();
auto weight_tensor = in_tensors_[kWeightIndex];
int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
int channel = weight_tensor->Batch();
int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width();
packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
weight_tensor->Batch());
PackNCHWToNHWCFp16(fp16_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
weight_tensor->Batch());
bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
bias_data_ = reinterpret_cast<float16_t *>(malloc(channel * sizeof(float16_t)));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
memset(bias_data_, 0, channel * sizeof(float16_t));
auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
if (in_tensors_.size() == kInputSize2) {
auto bias_tensor = in_tensors_.at(kBiasIndex);
......@@ -104,18 +67,10 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
bias_fp16[i] = (float16_t)ori_bias[i];
}
}
conv_param_->thread_num_ = MSMIN(thread_count_, OC8);
return RET_OK;
}
int ConvolutionDepthwiseFp16CPUKernel::Init() {
sliding_ = new (std::nothrow) SlidingWindowParam;
if (sliding_ == nullptr) {
MS_LOG(ERROR) << "new sliding window param failed.";
return RET_ERROR;
}
auto ret = InitWeightBias();
if (ret != 0) {
MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed.";
......@@ -129,24 +84,17 @@ int ConvolutionDepthwiseFp16CPUKernel::Init() {
}
int ConvolutionDepthwiseFp16CPUKernel::ReSize() {
FreeTmpBuffer();
auto ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
return ret;
}
InitSlidingParamConvDw(sliding_, conv_param_, C8NUM);
ret = InitBuffer();
if (ret != 0) {
MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed.";
return RET_ERROR;
}
conv_param_->thread_num_ = MSMIN(thread_count_, conv_param_->output_h_);
return RET_OK;
}
int ConvolutionDepthwiseFp16CPUKernel::Execute(int task_id) {
ConvDwC8Fp16(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_,
sliding_, task_id);
ConvDwFp16(execute_output_, execute_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_,
task_id);
return RET_OK;
}
......@@ -176,25 +124,13 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
MS_LOG(ERROR) << "Get Execute tensor failed.";
return ret;
}
if (need_align_) {
PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
} else {
packed_input_ = execute_input_;
}
if (!need_align_) {
packed_output_ = execute_output_;
}
ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";
return RET_ERROR;
}
if (need_align_) {
PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK;
......@@ -207,7 +143,14 @@ kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::tensor::T
const mindspore::lite::PrimitiveC *primitive) {
MS_ASSERT(opParameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_DepthwiseConv2D);
auto kernel = new (std::nothrow) ConvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
kernel::LiteKernel *kernel;
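// Select the kernel by channel count: fewer than 32 input channels use the sliding-window
// implementation, otherwise the new row-based ConvDwFp16 implementation.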
if (conv_param->input_channel_ < 32) {
kernel =
new (std::nothrow) kernel::ConvolutionDepthwiseSWFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else {
kernel = new (std::nothrow) kernel::ConvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
}
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel is nullptr.";
return nullptr;
......
......@@ -25,14 +25,12 @@
#ifdef __cplusplus
extern "C" {
#endif
void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
int task_id);
void ConvDwFp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
const float16_t *bias_data, const ConvParameter *conv_param, int task_id);
#ifdef __cplusplus
}
#endif
namespace mindspore::kernel {
class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
......@@ -46,17 +44,11 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int ReSize() override;
int Run() override;
int InitBuffer();
int InitWeightBias();
int Execute(int task_id);
private:
void FreeTmpBuffer();
SlidingWindowParam *sliding_ = nullptr;
float16_t *packed_weight_ = nullptr;
float16_t *packed_input_ = nullptr;
float16_t *packed_output_ = nullptr;
bool need_align_ = false;
};
} // namespace mindspore::kernel
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace mindspore::kernel {
ConvolutionDepthwiseSWFp16CPUKernel::~ConvolutionDepthwiseSWFp16CPUKernel() {
if (sliding_ != nullptr) {
delete sliding_;
sliding_ = nullptr;
}
if (packed_weight_ != nullptr) {
delete packed_weight_;
packed_weight_ = nullptr;
}
FreeTmpBuffer();
}
void ConvolutionDepthwiseSWFp16CPUKernel::FreeTmpBuffer() {
if (need_align_) {
if (packed_input_ != nullptr) {
delete packed_input_;
packed_input_ = nullptr;
}
if (packed_output_ != nullptr) {
delete packed_output_;
packed_output_ = nullptr;
}
}
}
int ConvolutionDepthwiseSWFp16CPUKernel::InitBuffer() {
if (conv_param_->input_channel_ % C4NUM != 0) {
need_align_ = true;
int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
if (packed_output_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
}
return RET_OK;
}
int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() {
// init weight: o, h, w, i; o == group, i == 1
auto weight_tensor = in_tensors_[kWeightIndex];
int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
weight_tensor->Batch());
bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
if (in_tensors_.size() == kInputSize2) {
auto bias_tensor = in_tensors_.at(kBiasIndex);
auto ori_bias = reinterpret_cast<float *>(bias_tensor->Data());
for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
bias_fp16[i] = (float16_t)ori_bias[i];
}
}
conv_param_->thread_num_ = MSMIN(thread_count_, OC8);
return RET_OK;
}
int ConvolutionDepthwiseSWFp16CPUKernel::Init() {
sliding_ = new (std::nothrow) SlidingWindowParam;
if (sliding_ == nullptr) {
MS_LOG(ERROR) << "new sliding window param failed.";
return RET_ERROR;
}
auto ret = InitWeightBias();
if (ret != 0) {
MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int ConvolutionDepthwiseSWFp16CPUKernel::ReSize() {
FreeTmpBuffer();
auto ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
return ret;
}
InitSlidingParamConvDw(sliding_, conv_param_, C8NUM);
ret = InitBuffer();
if (ret != 0) {
MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed.";
return RET_ERROR;
}
return RET_OK;
}
int ConvolutionDepthwiseSWFp16CPUKernel::Execute(int task_id) {
ConvDwC8Fp16(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_,
sliding_, task_id);
return RET_OK;
}
static int ConvDwSWFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
auto conv_dw_fp16 = reinterpret_cast<ConvolutionDepthwiseSWFp16CPUKernel *>(cdata);
auto ret = conv_dw_fp16->Execute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionDepthwiseSWFp16Run error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;
}
return RET_OK;
}
int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
auto ret = Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare failed.";
return RET_ERROR;
}
if (conv_param_->input_channel_ != conv_param_->output_channel_) {
MS_LOG(ERROR) << "Only support input channel equals output channel.";
return RET_ERROR;
}
ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute tensor failed.";
return ret;
}
if (need_align_) {
PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
} else {
packed_input_ = execute_input_;
}
if (!need_align_) {
packed_output_ = execute_output_;
}
ret = LiteBackendParallelLaunch(ConvDwSWFp16Run, this, conv_param_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvDwSWFp16Run error: error_code[" << ret << "]";
return RET_ERROR;
}
if (need_align_) {
PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK;
}
} // namespace mindspore::kernel
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_SW_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_SW_FP16_H_
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "nnacl/fp16/conv_depthwise_fp16.h"
#ifdef __cplusplus
extern "C" {
#endif
void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
int task_id);
#ifdef __cplusplus
}
#endif
namespace mindspore::kernel {
class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
ConvolutionDepthwiseSWFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDepthwiseSWFp16CPUKernel() override;
int Init() override;
int ReSize() override;
int Run() override;
int InitBuffer();
int InitWeightBias();
int Execute(int task_id);
private:
void FreeTmpBuffer();
SlidingWindowParam *sliding_ = nullptr;
float16_t *packed_weight_ = nullptr;
float16_t *packed_input_ = nullptr;
float16_t *packed_output_ = nullptr;
bool need_align_ = false;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_SW_FP16_H_
......@@ -17,7 +17,6 @@
#include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h"
#include "src/runtime/kernel/arm/fp16/matrix_fp16.h"
#include "nnacl/fp16/conv_fp16.h"
#include "nnacl/fp16/common_func.h"
#include "nnacl/fp16/cast_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "nnacl/fp16/winograd_transform_fp16.h"
......