Commit 8d06c2b8 authored by yangruoqi713

[MS][LITE] optimize arm cpu int8 conv depthwise op: add common and sliding-window implementations to select between

Parent e6112ed1
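In short, the commit keeps the existing sliding-window depthwise int8 kernel for per-channel filter quantization and adds a direct row-accumulation ("common") path for the per-tensor case; the kernel creator chooses between them. A minimal sketch of that selection, mirroring the CpuConvDwInt8KernelCreator change later in this diff (the framework classes and kWeightIndex are assumed to come from the existing kernel code):

// Hedged sketch of the dispatch this commit introduces; names follow the diff below.
kernel::LiteKernel *kernel = nullptr;
auto filter_quant_size = inputs[kWeightIndex]->GetQuantParams().size();
if (filter_quant_size == 1) {
  // per-tensor filter quantization -> new common path (ConvDwInt8Row + ConvDwInt8PostAlign4)
  kernel = new (std::nothrow) kernel::ConvolutionDepthwiseInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else {
  // per-channel filter quantization -> keep the sliding-window path (ConvDwSWInt8)
  kernel = new (std::nothrow) kernel::ConvolutionDepthwiseSWInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
}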
......@@ -29,7 +29,7 @@ mov x6, x1
mov x7, x2
mov x8, x4
LoopInputDepth16In:
LoopDepth16In:
cmp x8, #16
blt L4
sub x8, x8, #16
......@@ -39,8 +39,8 @@ mov x8, x4
ld1 {v16.4s, v17.4s}, [x0], #32
cmp x8, #16
blt LoopInputDepth16Out
LoopInputDepth16:
blt LoopDepth16Out
LoopDepth16:
fmla v16.4s, v0.4s, v2.4s
fmla v17.4s, v1.4s, v3.4s
......@@ -61,9 +61,9 @@ mov x8, x4
sub x8, x8, #16
cmp x8, #16
bge LoopInputDepth16
bge LoopDepth16
LoopInputDepth16Out:
LoopDepth16Out:
fmla v16.4s, v0.4s, v2.4s
fmla v17.4s, v1.4s, v3.4s
st1 {v16.4s, v17.4s}, [x9], #32
......@@ -81,7 +81,7 @@ mov x8, x4
cmp x8, #4
blt L0
LoopInputDepth4:
LoopDepth4:
ld1 {v0.4s}, [x6], #16
ld1 {v2.4s}, [x7], #16
ld1 {v16.4s}, [x0], #16
......@@ -89,13 +89,13 @@ mov x8, x4
st1 {v16.4s}, [x9], #16
sub x8, x8, #4
cmp x8, #4
bge LoopInputDepth4
bge LoopDepth4
L0:
cmp x8, #0
beq Loop16LineEnd
LoopInputDepth0:
LoopDepth0:
ldr s0, [x6], #4
ldr s1, [x7], #4
ldr s2, [x0], #4
......@@ -103,7 +103,7 @@ mov x8, x4
fadd s2, s2, s0
str s2, [x9], #4
subs x8, x8, #1
bne LoopInputDepth0
bne LoopDepth0
Loop16LineEnd:
......
#ifdef __aarch64__
.text
.align 5
.global ConvDwInt8PostAlign4
#ifndef __APPLE__
.type ConvDwInt8PostAlign4, %function
#endif
// void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
// int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
// x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier,
// x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max
ConvDwInt8PostAlign4:
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should also be preserved
// whereas our coding style does not permit that many parameters
ldr x8, [sp]
dup v26.4s, w5
dup v27.4s, w4
dup v28.4s, w6
dup v29.4s, w3
dup v30.4s, w7
dup v31.4s, w8
cmp x2, 16
blt LoopDepth8
LoopDepth16:
ld1 {v0.4s}, [x1], #16
ld1 {v1.4s}, [x1], #16
ld1 {v2.4s}, [x1], #16
ld1 {v3.4s}, [x1], #16
sqshl v0.4s, v0.4s, v26.4s
sqshl v1.4s, v1.4s, v26.4s
sqshl v2.4s, v2.4s, v26.4s
sqshl v3.4s, v3.4s, v26.4s
sqrdmulh v0.4s, v0.4s, v27.4s
sqrdmulh v1.4s, v1.4s, v27.4s
sqrdmulh v2.4s, v2.4s, v27.4s
sqrdmulh v3.4s, v3.4s, v27.4s
and v16.16b, v28.16b, v0.16b
sshr v16.4s, v16.4s, #31
sqadd v0.4s, v0.4s, v16.4s
srshl v0.4s, v0.4s, v28.4s
and v17.16b, v28.16b, v1.16b
sshr v17.4s, v17.4s, #31
sqadd v1.4s, v1.4s, v17.4s
srshl v1.4s, v1.4s, v28.4s
and v18.16b, v28.16b, v2.16b
sshr v18.4s, v18.4s, #31
sqadd v2.4s, v2.4s, v18.4s
srshl v2.4s, v2.4s, v28.4s
and v19.16b, v28.16b, v3.16b
sshr v19.4s, v19.4s, #31
sqadd v3.4s, v3.4s, v19.4s
srshl v3.4s, v3.4s, v28.4s
add v0.4s, v0.4s, v29.4s
add v1.4s, v1.4s, v29.4s
add v2.4s, v2.4s, v29.4s
add v3.4s, v3.4s, v29.4s
smax v0.4s, v0.4s, v30.4s
smax v1.4s, v1.4s, v30.4s
smax v2.4s, v2.4s, v30.4s
smax v3.4s, v3.4s, v30.4s
smin v0.4s, v0.4s, v31.4s
smin v1.4s, v1.4s, v31.4s
smin v2.4s, v2.4s, v31.4s
smin v3.4s, v3.4s, v31.4s
sqxtn v0.4h, v0.4s
sqxtn v1.4h, v1.4s
sqxtn v2.4h, v2.4s
sqxtn v3.4h, v3.4s
sqxtn v0.8b, v0.8h
sqxtn v1.8b, v1.8h
sqxtn v2.8b, v2.8h
sqxtn v3.8b, v3.8h
st1 {v0.s}[0], [x0], #4
st1 {v1.s}[0], [x0], #4
st1 {v2.s}[0], [x0], #4
st1 {v3.s}[0], [x0], #4
sub x2, x2, #16
cmp x2, #16
bge LoopDepth16
LoopDepth8:
cmp x2, #8
blt LoopDepth4
ld1 {v0.4s}, [x1], #16
ld1 {v1.4s}, [x1], #16
sqshl v0.4s, v0.4s, v26.4s
sqshl v1.4s, v1.4s, v26.4s
sqrdmulh v0.4s, v0.4s, v27.4s
sqrdmulh v1.4s, v1.4s, v27.4s
and v16.16b, v28.16b, v0.16b
sshr v16.4s, v16.4s, #31
sqadd v0.4s, v0.4s, v16.4s
srshl v0.4s, v0.4s, v28.4s
and v17.16b, v28.16b, v1.16b
sshr v17.4s, v17.4s, #31
sqadd v1.4s, v1.4s, v17.4s
srshl v1.4s, v1.4s, v28.4s
add v0.4s, v0.4s, v29.4s
add v1.4s, v1.4s, v29.4s
smax v0.4s, v0.4s, v30.4s
smax v1.4s, v1.4s, v30.4s
smin v0.4s, v0.4s, v31.4s
smin v1.4s, v1.4s, v31.4s
sqxtn v0.4h, v0.4s
sqxtn v1.4h, v1.4s
sqxtn v0.8b, v0.8h
sqxtn v1.8b, v1.8h
st1 {v0.s}[0], [x0], #4
st1 {v1.s}[0], [x0], #4
sub x2, x2, #8
cmp x2, #8
bge LoopDepth8
LoopDepth4:
cmp x2, #4
blt End
ld1 {v0.4s}, [x1], #16
sqshl v0.4s, v0.4s, v26.4s
sqrdmulh v0.4s, v0.4s, v27.4s
and v16.16b, v28.16b, v0.16b
sshr v16.4s, v16.4s, #31
sqadd v0.4s, v0.4s, v16.4s
srshl v0.4s, v0.4s, v28.4s
add v0.4s, v0.4s, v29.4s
smax v0.4s, v0.4s, v30.4s
smin v0.4s, v0.4s, v31.4s
sqxtn v0.4h, v0.4s
sqxtn v0.8b, v0.8h
st1 {v0.s}[0], [x0], #4
sub x2, x2, #4
bge LoopDepth4
End:
ret
#endif
#ifdef __aarch64__
.text
.align 5
.global ConvDwInt8Row
#ifndef __APPLE__
.type ConvDwInt8Row, %function
#endif
// void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
// int output_channel, int input_step, int8_t input_zp)
// x0: output_ptr, x1: input_ptr, x2: weight_ptr, x3: num_pixels,
// x4: output_channel, x5: input_step, x6: input_zp
//
ConvDwInt8Row:
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should also be preserved
// whereas our coding style does not permit that many parameters
cmp x3, #0
beq End
mov x10, x0
dup v31.8b, w6
LoopOutPixel:
mov x7, x1
mov x8, x2
mov x9, x4
LoopDepth16In:
cmp x9, #16
blt L8
sub x9, x9, #16
ld1 {v0.8b, v1.8b}, [x7], #16
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s}, [x0], #32
ssubl v20.8h, v0.8b, v31.8b
smlal v16.4s, v20.4h, v2.4h
smlal2 v17.4s, v20.8h, v2.8h
cmp x9, #16
blt LoopDepth16Out
LoopDepth16:
st1 {v16.4s, v17.4s}, [x10], #32
ld1 {v18.4s, v19.4s}, [x0], #32
ssubl v21.8h, v1.8b, v31.8b
smlal v18.4s, v21.4h, v3.4h
smlal2 v19.4s, v21.8h, v3.8h
st1 {v18.4s, v19.4s}, [x10], #32
ld1 {v0.8b, v1.8b}, [x7], #16
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s}, [x0], #32
ssubl v20.8h, v0.8b, v31.8b
smlal v16.4s, v20.4h, v2.4h
smlal2 v17.4s, v20.8h, v2.8h
sub x9, x9, #16
cmp x9, #16
bge LoopDepth16
LoopDepth16Out:
st1 {v16.4s, v17.4s}, [x10], #32
ld1 {v18.4s, v19.4s}, [x0], #32
ssubl v21.8h, v1.8b, v31.8b
smlal v18.4s, v21.4h, v3.4h
smlal2 v19.4s, v21.8h, v3.8h
st1 {v18.4s, v19.4s}, [x10], #32
L8:
cmp x9, #8
blt L0
LoopDepth8:
ld1 {v0.8b}, [x7], #8
ld1 {v2.8h}, [x8], #16
ld1 {v16.4s, v17.4s}, [x0], #32
ssubl v20.8h, v0.8b, v31.8b
smlal v16.4s, v20.4h, v2.4h
smlal2 v17.4s, v20.8h, v2.8h
st1 {v16.4s, v17.4s}, [x10], #32
sub x9, x9, #8
cmp x9, #8
bge LoopDepth8
L0:
cmp x9, #0
beq Loop16LineEnd
LoopDepth0:
ldrsb w14, [x7], #1
ldrsh w15, [x8], #2
ldr w16, [x0], #4
sub w14, w14, w6
sxth w14, w14
madd w14, w14, w15, w16
str w14, [x10], #4
subs x9, x9, #1
bne LoopDepth0
Loop16LineEnd:
subs x3, x3, #1
add x1, x1, x5
bne LoopOutPixel
End:
ret
#endif
......@@ -49,6 +49,10 @@ void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, co
size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int out_multiplier,
int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max);
void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
int output_channel, int input_step, int8_t input_zp);
void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
#endif
#ifdef __cplusplus
......
......@@ -20,6 +20,99 @@
#include "nnacl/int8/common_func.h"
/*conv depthwise int8 begin*/
// only supports per-layer quantization
#ifndef ENABLE_ARM64
void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
int output_channel, int input_step, int8_t input_zp) {
for (int i = 0; i < num_pixels; i++) {
for (int c = 0; c < output_channel; c++) {
const int16_t input = input_ptr[c] - input_zp;
*output_ptr++ += input * weight_ptr[c];
}
input_ptr += input_step;
}
}
#endif
void ConvDwInt8Post(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max) {
int align_num = 0;
#ifdef ENABLE_ARM64
align_num = num_pixels / 4 * 4;
ConvDwInt8PostAlign4(dst, buffer, align_num, output_zp, out_multiplier, left_shift, right_shift, acc_min, acc_max);
#endif
for (int i = align_num; i < num_pixels; i++) {
buffer[i] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(buffer[i] * (1 << (unsigned int)left_shift), out_multiplier), -right_shift);
buffer[i] += output_zp;
buffer[i] = MSMAX(buffer[i], acc_min);
buffer[i] = MSMIN(buffer[i], acc_max);
dst[i] = (int8_t)(buffer[i]);
}
}
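For reference, the two fixed-point helpers used above are the standard gemmlowp-style routines; the sketch below shows what they are assumed to compute (it is not copied from this commit, and nnacl's own definitions should be checked). They correspond to the NEON sequence in ConvDwInt8PostAlign4: sqshl applies the left shift, sqrdmulh is the saturating rounding doubling high multiply, and the and/sshr/sqadd/srshl group performs the rounding divide by a power of two.

#include <cstdint>
#include <climits>

// Assumed behavior of SaturatingRoundingDoublingHighMul: (a * b * 2) rounded to the high 32 bits,
// saturating the single overflow case INT32_MIN * INT32_MIN.
static inline int32_t SatRoundingDoublingHighMul(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN) {
    return INT32_MAX;
  }
  int64_t ab = (int64_t)a * (int64_t)b;
  int64_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  return (int32_t)((ab + nudge) / (1ll << 31));
}

// Assumed behavior of RoundingDivideByPOT: arithmetic right shift by 'exponent' with round-to-nearest.
static inline int32_t RoundingDivByPOT(int32_t x, int exponent) {
  int32_t mask = (int32_t)((1ll << exponent) - 1);
  int32_t remainder = x & mask;
  int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}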
void ConvDwInt8(int8_t *output_data, int32_t *row_buffer, const int8_t *input_data, const int16_t *weight_data,
const int32_t *bias_data, const ConvParameter *conv_param, int task_id) {
int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
int h_start = h_step * task_id;
int h_end = MSMIN(h_start + h_step, conv_param->output_h_);
int out_multiplier = conv_param->conv_quant_arg_.quant_multiplier_[0];
int left_shift = conv_param->conv_quant_arg_.left_shift_[0];
int right_shift = conv_param->conv_quant_arg_.right_shift_[0];
int input_zp = conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
int output_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_;
int acc_min = conv_param->conv_quant_arg_.out_act_min_[0];
int acc_max = conv_param->conv_quant_arg_.out_act_max_[0];
for (int b = 0; b < conv_param->output_batch_; b++) {
const int8_t *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
int8_t *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_;
for (int oh = h_start; oh < h_end; oh++) {
int8_t *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_;
int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_;
int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_));
int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));
// init acc
for (int ow = 0; ow < conv_param->output_w_; ow++) {
memcpy(row_buffer + ow * conv_param->output_channel_, bias_data, conv_param->output_channel_ * sizeof(int32_t));
}
for (int kh = start_kh; kh < end_kh; kh++) {
int ih = ih_origin + conv_param->dilation_h_ * kh;
const int8_t *src_kh = src + ih * conv_param->input_w_ * conv_param->input_channel_;
const int16_t *weight_kh = weight_data + kh * conv_param->kernel_w_ * conv_param->output_channel_;
int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_;
for (int kw = 0; kw < conv_param->kernel_w_; kw++) {
int out_w_start = MSMAX(
0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ -
conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) /
conv_param->stride_w_);
int32_t *acc_w = row_buffer + out_w_start * conv_param->output_channel_;
int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw;
const int8_t *src_kw = src_kh + iw_origin * conv_param->input_channel_;
int num_pixels = out_w_end - out_w_start;
ConvDwInt8Row(acc_w, src_kw, weight_kh, num_pixels, conv_param->output_channel_, in_sw_step, input_zp);
weight_kh += conv_param->output_channel_;
}
}
// post func, acc int32 -> dst int8
ConvDwInt8Post(dst_data, row_buffer, conv_param->output_w_ * conv_param->output_channel_, output_zp,
out_multiplier, left_shift, right_shift, acc_min, acc_max);
}
}
}
/*conv depthwise int8 end*/
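The out_w_start / out_w_end expressions in the kw loop above clip the output columns so that the corresponding input column stays inside the un-padded image. A minimal sketch of the derivation follows, using the same parameter names; the helper itself is illustrative and not part of the commit.

#include <algorithm>

// For kernel column kw, output column ow reads input column
//   iw = ow * stride_w - pad_l + dilation_w * kw.
// Requiring 0 <= iw < input_w and solving for ow gives the half-open range below,
// which is what the MSMAX/MSMIN expressions in ConvDwInt8 compute.
static void DwValidOutWRange(int output_w, int input_w, int pad_l, int stride_w, int dilation_w, int kw,
                             int *out_w_start, int *out_w_end) {
  int lo = (pad_l - dilation_w * kw + stride_w - 1) / stride_w;            // ceil((pad_l - d*kw) / stride_w)
  int hi = (input_w + pad_l - dilation_w * kw + stride_w - 1) / stride_w;  // ceil((input_w + pad_l - d*kw) / stride_w)
  *out_w_start = std::max(0, lo);
  *out_w_end = std::min(output_w, hi);
}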
/*conv depthwise sliding window int8 begin*/
void DepthwiseBorderPixelInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height,
int width, int in_kh_step, int in_kw_step, int kernel_w, int *out_multiplier,
int *left_shift, int *right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max,
......@@ -153,8 +246,8 @@ void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight,
}
#endif
void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) {
void ConvDwSWInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) {
const int16_t *src = input_data;
int8_t *dst = output_data;
bool per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
......@@ -215,7 +308,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
} // batch loop
// output nhwc4
}
/*conv depthwise int8 end*/
/*conv depthwise sliding window int8 end*/
/*deconv depthwise int8 begin*/
void DeconvDepthwiseBorderPixelInt8(int32_t *dst, const int16_t *src, const int16_t *weight, int height, int width,
......
......@@ -23,8 +23,12 @@
#ifdef __cplusplus
extern "C" {
#endif
void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id);
void ConvDwInt8(int8_t *output_data, int32_t *output_row, const int8_t *input_data, const int16_t *weight_data,
const int32_t *bias_data, const ConvParameter *conv_param, int task_id);
void ConvDwSWInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id);
void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *input_data, const int16_t *weight_data,
const int32_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
......
......@@ -15,6 +15,7 @@
*/
#include "src/runtime/kernel/arm/int8/convolution_depthwise_int8.h"
#include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
......@@ -29,10 +30,6 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace mindspore::kernel {
ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
if (sliding != nullptr) {
delete sliding;
sliding = nullptr;
}
if (packed_weight_ != nullptr) {
free(packed_weight_);
packed_weight_ = nullptr;
......@@ -42,63 +39,44 @@ ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
// init weight, int8 -> int16
// o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
auto weight_tensor = in_tensors_[kWeightIndex];
auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->Data());
int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
int channel = weight_tensor->Batch();
int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width();
auto tmp_weight = reinterpret_cast<int8_t *>(malloc(pack_weight_size * sizeof(int8_t)));
if (tmp_weight == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
PackNCHWToNHWCInt8(origin_weight, tmp_weight, 1, weight_tensor->Height() * weight_tensor->Width(),
weight_tensor->Batch());
int weight_zp = conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_;
packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(),
weight_tensor->Batch(), &(conv_param_->conv_quant_arg_));
for (int i = 0; i < weight_tensor->ElementsNum(); i++) {
packed_weight_[i] = (int16_t)(tmp_weight[i] - weight_zp);
}
bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
bias_data_ = reinterpret_cast<int32_t *>(malloc(channel * sizeof(int32_t)));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
memset(bias_data_, 0, channel * sizeof(int32_t));
if (in_tensors_.size() == kInputSize2) {
auto bias_tensor = in_tensors_.at(kBiasIndex);
auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->Data());
memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
}
conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
return RET_OK;
}
int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() {
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
UP_DIV(conv_param_->input_channel_, 4);
packed_input_ = reinterpret_cast<int16_t *>(context_->allocator->Malloc(pack_input_size * sizeof(int16_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
if (conv_param_->input_channel_ % C4NUM != 0) {
need_align_ = true;
int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
UP_DIV(conv_param_->output_channel_, C4NUM);
packed_output_ = reinterpret_cast<int8_t *>(context_->allocator->Malloc(pack_output_size * sizeof(int8_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
}
return RET_OK;
}
int ConvolutionDepthwiseInt8CPUKernel::Init() {
sliding = new (std::nothrow) SlidingWindowParam;
if (sliding == nullptr) {
MS_LOG(ERROR) << "new sliding window param.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
......@@ -107,13 +85,12 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() {
int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
ConvolutionBaseCPUKernel::Init();
InitSlidingParamConvDw(sliding, conv_param_, C4NUM);
auto ret = ConvolutionBaseCPUKernel::SetQuantParam();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Set quant param failed.";
return ret;
}
conv_param_->thread_num_ = MSMIN(thread_count_, conv_param_->output_h_);
ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!";
......@@ -123,8 +100,9 @@ int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
}
int ConvolutionDepthwiseInt8CPUKernel::Execute(int task_id) {
ConvDwInt8(packed_output_, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), conv_param_,
sliding, task_id);
auto buffer = row_buffer_ + conv_param_->output_w_ * conv_param_->output_channel_ * task_id;
ConvDwInt8(output_ptr_, buffer, input_ptr_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), conv_param_,
task_id);
return RET_OK;
}
......@@ -138,6 +116,16 @@ int ConvDwInt8Run(void *cdata, int task_id) {
return RET_OK;
}
int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() {
int output_row_size = conv_param_->thread_num_ * conv_param_->output_w_ * conv_param_->output_channel_;
row_buffer_ = reinterpret_cast<int32_t *>(context_->allocator->Malloc(output_row_size * sizeof(int32_t)));
if (row_buffer_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
return RET_OK;
}
int ConvolutionDepthwiseInt8CPUKernel::Run() {
if (conv_param_->input_channel_ != conv_param_->output_channel_) {
MS_LOG(ERROR) << "Only support input channel equals output channel.";
......@@ -156,13 +144,10 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() {
}
auto input_tensor = in_tensors_.at(kInputIndex);
auto input_addr = reinterpret_cast<int8_t *>(input_tensor->Data());
PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
input_ptr_ = reinterpret_cast<int8_t *>(input_tensor->Data());
auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->Data());
if (!need_align_) {
packed_output_ = output_addr;
}
auto output_tensor = out_tensors_.at(kOutputIndex);
output_ptr_ = reinterpret_cast<int8_t *>(output_tensor->Data());
ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwInt8Run, this, conv_param_->thread_num_);
if (ret != RET_OK) {
......@@ -170,12 +155,7 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() {
return RET_ERROR;
}
if (need_align_) {
PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
context_->allocator->Free(packed_output_);
}
context_->allocator->Free(packed_input_);
context_->allocator->Free(row_buffer_);
return RET_OK;
}
......@@ -186,8 +166,14 @@ kernel::LiteKernel *CpuConvDwInt8KernelCreator(const std::vector<lite::tensor::T
const mindspore::lite::PrimitiveC *primitive) {
MS_ASSERT(opParameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_DepthwiseConv2D);
auto kernel =
new (std::nothrow) kernel::ConvolutionDepthwiseInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
kernel::LiteKernel *kernel;
auto filter_quant_size = inputs[kWeightIndex]->GetQuantParams().size();
if (filter_quant_size == 1) { // per tensor
kernel = new (std::nothrow) kernel::ConvolutionDepthwiseInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else { // per channel
kernel =
new (std::nothrow) kernel::ConvolutionDepthwiseSWInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
}
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel is nullptr.";
return nullptr;
......
......@@ -36,15 +36,14 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
int Run() override;
int InitWeightBias();
int InitBuffer();
int Execute(int task_id);
private:
SlidingWindowParam *sliding = nullptr;
int InitBuffer();
int16_t *packed_weight_ = nullptr;
int16_t *packed_input_ = nullptr;
int8_t *packed_output_ = nullptr;
bool need_align_ = false;
int8_t *input_ptr_ = nullptr;
int8_t *output_ptr_ = nullptr;
int32_t *row_buffer_ = nullptr;
};
} // namespace mindspore::kernel
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "nnacl/int8/conv_depthwise_int8.h"
#include "src/runtime/runtime_api.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace mindspore::kernel {
ConvolutionDepthwiseSWInt8CPUKernel::~ConvolutionDepthwiseSWInt8CPUKernel() {
if (sliding != nullptr) {
delete sliding;
sliding = nullptr;
}
if (packed_weight_ != nullptr) {
free(packed_weight_);
packed_weight_ = nullptr;
}
FreeQuantParam();
}
int ConvolutionDepthwiseSWInt8CPUKernel::InitWeightBias() {
// init weight, int8 -> int16
// o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
auto weight_tensor = in_tensors_[kWeightIndex];
auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->Data());
int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(),
weight_tensor->Batch(), &(conv_param_->conv_quant_arg_));
bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
if (in_tensors_.size() == kInputSize2) {
auto bias_tensor = in_tensors_.at(kBiasIndex);
auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->Data());
memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
}
conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
return RET_OK;
}
int ConvolutionDepthwiseSWInt8CPUKernel::InitBuffer() {
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
UP_DIV(conv_param_->input_channel_, 4);
packed_input_ = reinterpret_cast<int16_t *>(context_->allocator->Malloc(pack_input_size * sizeof(int16_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
if (conv_param_->input_channel_ % C4NUM != 0) {
need_align_ = true;
int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
UP_DIV(conv_param_->output_channel_, C4NUM);
packed_output_ = reinterpret_cast<int8_t *>(context_->allocator->Malloc(pack_output_size * sizeof(int8_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
}
return RET_OK;
}
int ConvolutionDepthwiseSWInt8CPUKernel::Init() {
sliding = new (std::nothrow) SlidingWindowParam;
if (sliding == nullptr) {
MS_LOG(ERROR) << "new sliding window param.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int ConvolutionDepthwiseSWInt8CPUKernel::ReSize() {
ConvolutionBaseCPUKernel::Init();
InitSlidingParamConvDw(sliding, conv_param_, C4NUM);
auto ret = ConvolutionBaseCPUKernel::SetQuantParam();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Set quant param failed.";
return ret;
}
ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!";
return ret;
}
return RET_OK;
}
int ConvolutionDepthwiseSWInt8CPUKernel::Execute(int task_id) {
ConvDwSWInt8(packed_output_, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), conv_param_,
sliding, task_id);
return RET_OK;
}
int ConvDwSWInt8Run(void *cdata, int task_id) {
auto conv_dw_int8 = reinterpret_cast<ConvolutionDepthwiseSWInt8CPUKernel *>(cdata);
auto ret = conv_dw_int8->Execute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionDepthwiseSWInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;
}
return RET_OK;
}
int ConvolutionDepthwiseSWInt8CPUKernel::Run() {
if (conv_param_->input_channel_ != conv_param_->output_channel_) {
MS_LOG(ERROR) << "Only support input channel equals output channel.";
return RET_ERROR;
}
auto ret = Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare failed.";
return RET_ERROR;
}
ret = InitBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Depthwise int8 ReSize error!";
return ret;
}
auto input_tensor = in_tensors_.at(kInputIndex);
auto input_addr = reinterpret_cast<int8_t *>(input_tensor->Data());
PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->Data());
if (!need_align_) {
packed_output_ = output_addr;
}
ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwSWInt8Run, this, conv_param_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvDwSWInt8Run error: error_code[" << ret << "]";
return RET_ERROR;
}
if (need_align_) {
PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
context_->allocator->Free(packed_output_);
}
context_->allocator->Free(packed_input_);
return RET_OK;
}
} // namespace mindspore::kernel
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "nnacl/fp32/conv_depthwise.h"
namespace mindspore::kernel {
class ConvolutionDepthwiseSWInt8CPUKernel : public ConvolutionBaseCPUKernel {
public:
ConvolutionDepthwiseSWInt8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDepthwiseSWInt8CPUKernel() override;
int Init() override;
int ReSize() override;
int Run() override;
int InitWeightBias();
int InitBuffer();
int Execute(int task_id);
private:
SlidingWindowParam *sliding = nullptr;
int16_t *packed_weight_ = nullptr;
int16_t *packed_input_ = nullptr;
int8_t *packed_output_ = nullptr;
bool need_align_ = false;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_