diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S index 8323178a2c95a65e19145c24f40d8a58056a1f41..3ca68cd60e1332fbdfb447656351fcc8b5ca9459 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S @@ -29,7 +29,7 @@ mov x6, x1 mov x7, x2 mov x8, x4 - LoopInputDepth16In: + LoopDepth16In: cmp x8, #16 blt L4 sub x8, x8, #16 @@ -39,8 +39,8 @@ mov x8, x4 ld1 {v16.4s, v17.4s}, [x0], #32 cmp x8, #16 - blt LoopInputDepth16Out - LoopInputDepth16: + blt LoopDepth16Out + LoopDepth16: fmla v16.4s, v0.4s, v2.4s fmla v17.4s, v1.4s, v3.4s @@ -61,9 +61,9 @@ mov x8, x4 sub x8, x8, #16 cmp x8, #16 - bge LoopInputDepth16 + bge LoopDepth16 - LoopInputDepth16Out: + LoopDepth16Out: fmla v16.4s, v0.4s, v2.4s fmla v17.4s, v1.4s, v3.4s st1 {v16.4s, v17.4s}, [x9], #32 @@ -81,7 +81,7 @@ mov x8, x4 cmp x8, #4 blt L0 - LoopInputDepth4: + LoopDepth4: ld1 {v0.4s}, [x6], #16 ld1 {v2.4s}, [x7], #16 ld1 {v16.4s}, [x0], #16 @@ -89,13 +89,13 @@ mov x8, x4 st1 {v16.4s}, [x9], #16 sub x8, x8, #4 cmp x8, #4 - bge LoopInputDepth4 + bge LoopDepth4 L0: cmp x8, #0 beq Loop16LineEnd - LoopInputDepth0: + LoopDepth0: ldr s0, [x6], #4 ldr s1, [x7], #4 ldr s2, [x0], #4 @@ -103,7 +103,7 @@ mov x8, x4 fadd s2, s2, s0 str s2, [x9], #4 subs x8, x8, #1 - bne LoopInputDepth0 + bne LoopDepth0 Loop16LineEnd: diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S new file mode 100644 index 0000000000000000000000000000000000000000..de74f339eb0badb7461899f38d2d7d2e7b6280d2 --- /dev/null +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S @@ -0,0 +1,169 @@ +#ifdef __aarch64__ + +.text +.align 5 +.global ConvDwInt8PostAlign4 +#ifndef __APPLE__ +.type ConvDwInt8PostAlign4, %function +#endif + +// void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t 
out_multiplier, +// int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max); +// x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier, +// x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max + +ConvDwInt8PostAlign4: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should be also preserved + // whereas our coding style do not permit such amount of parameters + ldr x8, [sp] + + dup v26.4s, w5 + dup v27.4s, w4 + dup v28.4s, w6 + + dup v29.4s, w3 + dup v30.4s, w7 + dup v31.4s, w8 + + cmp x2, 16 + blt LoopDepth8 + + LoopDepth16: + ld1 {v0.4s}, [x1], #16 + ld1 {v1.4s}, [x1], #16 + ld1 {v2.4s}, [x1], #16 + ld1 {v3.4s}, [x1], #16 + + sqshl v0.4s, v0.4s, v26.4s + sqshl v1.4s, v1.4s, v26.4s + sqshl v2.4s, v2.4s, v26.4s + sqshl v3.4s, v3.4s, v26.4s + + sqrdmulh v0.4s, v0.4s, v27.4s + sqrdmulh v1.4s, v1.4s, v27.4s + sqrdmulh v2.4s, v2.4s, v27.4s + sqrdmulh v3.4s, v3.4s, v27.4s + + and v16.16b, v28.16b, v0.16b + sshr v16.4s, v16.4s, #31 + sqadd v0.4s, v0.4s, v16.4s + srshl v0.4s, v0.4s, v28.4s + and v17.16b, v28.16b, v1.16b + sshr v17.4s, v17.4s, #31 + sqadd v1.4s, v1.4s, v17.4s + srshl v1.4s, v1.4s, v28.4s + and v18.16b, v28.16b, v2.16b + sshr v18.4s, v18.4s, #31 + sqadd v2.4s, v2.4s, v18.4s + srshl v2.4s, v2.4s, v28.4s + and v19.16b, v28.16b, v3.16b + sshr v19.4s, v19.4s, #31 + sqadd v3.4s, v3.4s, v19.4s + srshl v3.4s, v3.4s, v28.4s + + add v0.4s, v0.4s, v29.4s + add v1.4s, v1.4s, v29.4s + add v2.4s, v2.4s, v29.4s + add v3.4s, v3.4s, v29.4s + + smax v0.4s, v0.4s, v30.4s + smax v1.4s, v1.4s, v30.4s + smax v2.4s, v2.4s, v30.4s + smax v3.4s, v3.4s, v30.4s + + smin v0.4s, v0.4s, v31.4s + smin v1.4s, v1.4s, v31.4s + smin v2.4s, v2.4s, v31.4s + smin v3.4s, v3.4s, v31.4s + + sqxtn v0.4h, v0.4s + sqxtn v1.4h, v1.4s + sqxtn v2.4h, v2.4s + sqxtn v3.4h, v3.4s + + sqxtn v0.8b, 
v0.8h + sqxtn v1.8b, v1.8h + sqxtn v2.8b, v2.8h + sqxtn v3.8b, v3.8h + + st1 {v0.s}[0], [x0], #4 + st1 {v1.s}[0], [x0], #4 + st1 {v2.s}[0], [x0], #4 + st1 {v3.s}[0], [x0], #4 + + sub x2, x2, #16 + cmp x2, #16 + bge LoopDepth16 + + LoopDepth8: + cmp x2, #8 + blt LoopDepth4 + ld1 {v0.4s}, [x1], #16 + ld1 {v1.4s}, [x1], #16 + + sqshl v0.4s, v0.4s, v26.4s + sqshl v1.4s, v1.4s, v26.4s + + sqrdmulh v0.4s, v0.4s, v27.4s + sqrdmulh v1.4s, v1.4s, v27.4s + + and v16.16b, v28.16b, v0.16b + sshr v16.4s, v16.4s, #31 + sqadd v0.4s, v0.4s, v16.4s + srshl v0.4s, v0.4s, v28.4s + and v17.16b, v28.16b, v1.16b + sshr v17.4s, v17.4s, #31 + sqadd v1.4s, v1.4s, v17.4s + srshl v1.4s, v1.4s, v28.4s + + add v0.4s, v0.4s, v29.4s + add v1.4s, v1.4s, v29.4s + + smax v0.4s, v0.4s, v30.4s + smax v1.4s, v1.4s, v30.4s + + smin v0.4s, v0.4s, v31.4s + smin v1.4s, v1.4s, v31.4s + + sqxtn v0.4h, v0.4s + sqxtn v1.4h, v1.4s + + sqxtn v0.8b, v0.8h + sqxtn v1.8b, v1.8h + + st1 {v0.s}[0], [x0], #4 + st1 {v1.s}[0], [x0], #4 + + sub x2, x2, #8 + cmp x2, #8 + bge LoopDepth8 + + LoopDepth4: + cmp x2, #4 + blt End + ld1 {v0.4s}, [x1], #16 + + sqshl v0.4s, v0.4s, v26.4s + sqrdmulh v0.4s, v0.4s, v27.4s + + and v16.16b, v28.16b, v0.16b + sshr v16.4s, v16.4s, #31 + sqadd v0.4s, v0.4s, v16.4s + srshl v0.4s, v0.4s, v28.4s + + add v0.4s, v0.4s, v29.4s + smax v0.4s, v0.4s, v30.4s + smin v0.4s, v0.4s, v31.4s + + sqxtn v0.4h, v0.4s + sqxtn v0.8b, v0.8h + + st1 {v0.s}[0], [x0], #4 + + sub x2, x2, #4 + bge LoopDepth4 + End: + ret +#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S new file mode 100644 index 0000000000000000000000000000000000000000..7d32ef9bae4e69a1d4c8cd83665373f609327079 --- /dev/null +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S @@ -0,0 +1,122 @@ +#ifdef __aarch64__ + +.text +.align 5 +.global ConvDwInt8Row +#ifndef __APPLE__ +.type ConvDwInt8Row, %function +#endif + +// void ConvDwInt8Row(int32_t *output_ptr, const 
int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels, +// int output_channel, int input_step, int8_t input_zp) +// x0: output_ptr, x1: input_ptr, x2: weight_ptr, x3: num_pixels, +// x4: output_channel, x5: input_step, x6: input_zp +// +ConvDwInt8Row: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should be also preserved + // whereas our coding style do not permit such amount of parameters +cmp x3, #0 +beq End + +mov x10, x0 + +dup v31.8b, w6 + +LoopOutPixel: +mov x7, x1 +mov x8, x2 +mov x9, x4 + + LoopDepth16In: + cmp x9, #16 + blt L8 + sub x9, x9, #16 + + ld1 {v0.8b, v1.8b}, [x7], #16 + ld1 {v2.8h, v3.8h}, [x8], #32 + ld1 {v16.4s, v17.4s}, [x0], #32 + + ssubl v20.8h, v0.8b, v31.8b + smlal v16.4s, v20.4h, v2.4h + smlal2 v17.4s, v20.8h, v2.8h + + + cmp x9, #16 + blt LoopDepth16Out + LoopDepth16: + + st1 {v16.4s, v17.4s}, [x10], #32 + ld1 {v18.4s, v19.4s}, [x0], #32 + ssubl v21.8h, v1.8b, v31.8b + smlal v18.4s, v21.4h, v3.4h + smlal2 v19.4s, v21.8h, v3.8h + st1 {v18.4s, v19.4s}, [x10], #32 + + ld1 {v0.8b, v1.8b}, [x7], #16 + ld1 {v2.8h, v3.8h}, [x8], #32 + ld1 {v16.4s, v17.4s}, [x0], #32 + + ssubl v20.8h, v0.8b, v31.8b + smlal v16.4s, v20.4h, v2.4h + smlal2 v17.4s, v20.8h, v2.8h + + sub x9, x9, #16 + cmp x9, #16 + bge LoopDepth16 + + LoopDepth16Out: + + st1 {v16.4s, v17.4s}, [x10], #32 + ld1 {v18.4s, v19.4s}, [x0], #32 + ssubl v21.8h, v1.8b, v31.8b + smlal v18.4s, v21.4h, v3.4h + smlal2 v19.4s, v21.8h, v3.8h + st1 {v18.4s, v19.4s}, [x10], #32 + + L8: + cmp x9, #8 + blt L0 + + LoopDepth8: + ld1 {v0.8b}, [x7], #8 + ld1 {v2.8h}, [x8], #16 + ld1 {v16.4s, v17.4s}, [x0], #32 + + ssubl v20.8h, v0.8b, v31.8b + smlal v16.4s, v20.4h, v2.4h + smlal2 v17.4s, v20.8h, v2.8h + st1 {v16.4s, v17.4s}, [x10], #32 + + sub x9, x9, #8 + cmp x9, #8 + bge LoopDepth8 + + L0: + cmp x9, #0 + beq Loop16LineEnd + + 
LoopDepth0:
+    ldrsb w14, [x7], #1
+    ldrsh w15, [x8], #2
+    ldr w16, [x0], #4
+    sub w14, w14, w6
+
+    sxth w14, w14
+    madd w14, w14, w15, w16
+    str w14, [x10], #4
+
+    subs x9, x9, #1
+    bne LoopDepth0
+
+    Loop16LineEnd:
+
+subs x3, x3, #1
+add x1, x1, x5
+bne LoopOutPixel
+
+End:
+ret
+
+#endif
diff --git a/mindspore/lite/nnacl/int8/common_func.h b/mindspore/lite/nnacl/int8/common_func.h
index bc8b35a0b85b78087918450d68f71743c87008d3..1e1b965d34d3482d57815231306fd66113c2da45 100644
--- a/mindspore/lite/nnacl/int8/common_func.h
+++ b/mindspore/lite/nnacl/int8/common_func.h
@@ -49,6 +49,10 @@ void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, co
                       size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
                       size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int out_multiplier,
                       int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max);
+void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
+                   int output_channel, int input_step, int8_t input_zp);
+void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
+                          int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
 #endif
 
 #ifdef __cplusplus
diff --git a/mindspore/lite/nnacl/int8/conv_depthwise_int8.c b/mindspore/lite/nnacl/int8/conv_depthwise_int8.c
index 817d549a39b51549f02fb605f1edebba3d10d08c..7e7d9d4067c1e008ff797ecb72d54725b1872aa5 100644
--- a/mindspore/lite/nnacl/int8/conv_depthwise_int8.c
+++ b/mindspore/lite/nnacl/int8/conv_depthwise_int8.c
@@ -20,6 +20,99 @@
 #include "nnacl/int8/common_func.h"
 
 /*conv depthwise int8 begin*/
+// only support perlayer
+#ifndef ENABLE_ARM64
+void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
+                   int output_channel, int input_step, int8_t input_zp) {
+  for (int i = 0; i < num_pixels; i++) {
+    for (int c = 
0; c < output_channel; c++) { + const int16_t input = input_ptr[c] - input_zp; + *output_ptr++ += input * weight_ptr[c]; + } + input_ptr += input_step; + } +} +#endif + +void ConvDwInt8Post(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier, + int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max) { + int align_num = 0; +#ifdef ENABLE_ARM64 + align_num = num_pixels / 4 * 4; + ConvDwInt8PostAlign4(dst, buffer, align_num, output_zp, out_multiplier, left_shift, right_shift, acc_min, acc_max); +#endif + for (int i = align_num; i < num_pixels; i++) { + buffer[i] = RoundingDivideByPOT( + SaturatingRoundingDoublingHighMul(buffer[i] * (1 << (unsigned int)left_shift), out_multiplier), -right_shift); + buffer[i] += output_zp; + buffer[i] = MSMAX(buffer[i], acc_min); + buffer[i] = MSMIN(buffer[i], acc_max); + dst[i] = (buffer[i]); + } +} + +void ConvDwInt8(int8_t *output_data, int32_t *row_buffer, const int8_t *input_data, const int16_t *weight_data, + const int32_t *bias_data, const ConvParameter *conv_param, int task_id) { + int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_); + int h_start = h_step * task_id; + int h_end = MSMIN(h_start + h_step, conv_param->output_h_); + + int out_multiplier = conv_param->conv_quant_arg_.quant_multiplier_[0]; + int left_shift = conv_param->conv_quant_arg_.left_shift_[0]; + int right_shift = conv_param->conv_quant_arg_.right_shift_[0]; + + int intput_zp = conv_param->conv_quant_arg_.input_quant_args_[0].zp_; + int output_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_; + int acc_min = conv_param->conv_quant_arg_.out_act_min_[0]; + int acc_max = conv_param->conv_quant_arg_.out_act_max_[0]; + + for (int b = 0; b < conv_param->output_batch_; b++) { + const int8_t *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_; + int8_t *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * 
conv_param->output_channel_;
+    for (int oh = h_start; oh < h_end; oh++) {
+      int8_t *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_;
+
+      int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_;
+      int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_));
+      int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));
+
+      // init acc
+      for (int ow = 0; ow < conv_param->output_w_; ow++) {
+        memcpy(row_buffer + ow * conv_param->output_channel_, bias_data, conv_param->output_channel_ * sizeof(int32_t));
+      }
+      for (int kh = start_kh; kh < end_kh; kh++) {
+        int ih = ih_origin + conv_param->dilation_h_ * kh;
+
+        const int8_t *src_kh = src + ih * conv_param->input_w_ * conv_param->input_channel_;
+        const int16_t *weight_kh = weight_data + kh * conv_param->kernel_w_ * conv_param->output_channel_;
+
+        int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_;
+        for (int kw = 0; kw < conv_param->kernel_w_; kw++) {
+          int out_w_start = MSMAX(
+            0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
+          int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ -
+                                                        conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) /
+                                                         conv_param->stride_w_);
+
+          int32_t *acc_w = row_buffer + out_w_start * conv_param->output_channel_;
+          int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw;
+
+          const int8_t *src_kw = src_kh + iw_origin * conv_param->input_channel_;
+          int num_pixels = out_w_end - out_w_start;
+
+          ConvDwInt8Row(acc_w, src_kw, weight_kh, num_pixels, conv_param->output_channel_, in_sw_step, intput_zp);
+          weight_kh += conv_param->output_channel_;
+        }
+      }
+      // post func, acc int32 -> dst int8
+      ConvDwInt8Post(dst_data, row_buffer, conv_param->output_w_ * conv_param->output_channel_, output_zp,
+                     out_multiplier, left_shift, right_shift, acc_min, 
acc_max); + } + } +} +/*conv depthwise int8 end*/ + +/*conv depthwise sliding window int8 begin*/ void DepthwiseBorderPixelInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height, int width, int in_kh_step, int in_kw_step, int kernel_w, int *out_multiplier, int *left_shift, int *right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max, @@ -153,8 +246,8 @@ void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight, } #endif -void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, - const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) { +void ConvDwSWInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, + const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) { const int16_t *src = input_data; int8_t *dst = output_data; bool per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL; @@ -215,7 +308,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w } // batch loop // output nhwc4 } -/*conv depthwise int8 end*/ +/*conv depthwise sliding window int8 end*/ /*deconv depthwise int8 begin*/ void DeconvDepthwiseBorderPixelInt8(int32_t *dst, const int16_t *src, const int16_t *weight, int height, int width, diff --git a/mindspore/lite/nnacl/int8/conv_depthwise_int8.h b/mindspore/lite/nnacl/int8/conv_depthwise_int8.h index 19a4ad0fd865e366a8c080fdd4f195ed76569f0d..004b9dff27bf617156158358b10a9fedcd2607a7 100644 --- a/mindspore/lite/nnacl/int8/conv_depthwise_int8.h +++ b/mindspore/lite/nnacl/int8/conv_depthwise_int8.h @@ -23,8 +23,12 @@ #ifdef __cplusplus extern "C" { #endif -void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, - const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id); + +void ConvDwInt8(int8_t 
*output_data, int32_t *output_row, const int8_t *input_data, const int16_t *weight_data, + const int32_t *bias_data, const ConvParameter *conv_param, int task_id); + +void ConvDwSWInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, + const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id); void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc index 3b8bfa935a507dcf74d9c7585f1210277fd4edd9..a1e47bd2a48dc12e53365dbdfe94b0f263d55c1f 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc @@ -15,6 +15,7 @@ */ #include "src/runtime/kernel/arm/int8/convolution_depthwise_int8.h" +#include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h" #include "schema/model_generated.h" #include "src/kernel_registry.h" #include "include/errorcode.h" @@ -29,10 +30,6 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D; namespace mindspore::kernel { ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() { - if (sliding != nullptr) { - delete sliding; - sliding = nullptr; - } if (packed_weight_ != nullptr) { free(packed_weight_); packed_weight_ = nullptr; @@ -42,63 +39,44 @@ ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() { int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() { // init weight, int8 -> int16 - // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1 auto weight_tensor = in_tensors_[kWeightIndex]; auto origin_weight = reinterpret_cast(weight_tensor->Data()); - int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM); - 
int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width(); + int channel = weight_tensor->Batch(); + int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width(); + auto tmp_weight = reinterpret_cast(malloc(pack_weight_size * sizeof(int8_t))); + if (tmp_weight == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + PackNCHWToNHWCInt8(origin_weight, tmp_weight, 1, weight_tensor->Height() * weight_tensor->Width(), + weight_tensor->Batch()); + + int weight_zp = conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_; packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(int16_t))); if (packed_weight_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(), - weight_tensor->Batch(), &(conv_param_->conv_quant_arg_)); + for (int i = 0; i < weight_tensor->ElementsNum(); i++) { + packed_weight_[i] = (int16_t)(tmp_weight[i] - weight_zp); + } - bias_data_ = reinterpret_cast(malloc(C4NUM * OC4 * sizeof(int32_t))); + bias_data_ = reinterpret_cast(malloc(channel * sizeof(int32_t))); if (bias_data_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); + memset(bias_data_, 0, channel * sizeof(int32_t)); if (in_tensors_.size() == kInputSize2) { auto bias_tensor = in_tensors_.at(kBiasIndex); auto ori_bias = reinterpret_cast(bias_tensor->Data()); memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t)); } - conv_param_->thread_num_ = MSMIN(thread_count_, OC4); - return RET_OK; -} - -int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() { - int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * - UP_DIV(conv_param_->input_channel_, 4); - packed_input_ = reinterpret_cast(context_->allocator->Malloc(pack_input_size * 
sizeof(int16_t))); - if (packed_input_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } - - if (conv_param_->input_channel_ % C4NUM != 0) { - need_align_ = true; - int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * - UP_DIV(conv_param_->output_channel_, C4NUM); - packed_output_ = reinterpret_cast(context_->allocator->Malloc(pack_output_size * sizeof(int8_t))); - if (packed_input_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } - } return RET_OK; } int ConvolutionDepthwiseInt8CPUKernel::Init() { - sliding = new (std::nothrow) SlidingWindowParam; - if (sliding == nullptr) { - MS_LOG(ERROR) << "new sliding window param."; - return RET_ERROR; - } if (!InferShapeDone()) { return RET_OK; } @@ -107,13 +85,12 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() { int ConvolutionDepthwiseInt8CPUKernel::ReSize() { ConvolutionBaseCPUKernel::Init(); - InitSlidingParamConvDw(sliding, conv_param_, C4NUM); - auto ret = ConvolutionBaseCPUKernel::SetQuantParam(); if (ret != RET_OK) { MS_LOG(ERROR) << "Set quant param failed."; return ret; } + conv_param_->thread_num_ = MSMIN(thread_count_, conv_param_->output_h_); ret = InitWeightBias(); if (ret != RET_OK) { MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!"; @@ -123,8 +100,9 @@ int ConvolutionDepthwiseInt8CPUKernel::ReSize() { } int ConvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { - ConvDwInt8(packed_output_, packed_input_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, - sliding, task_id); + auto buffer = row_buffer_ + conv_param_->output_w_ * conv_param_->output_channel_ * task_id; + ConvDwInt8(output_ptr_, buffer, input_ptr_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, + task_id); return RET_OK; } @@ -138,6 +116,16 @@ int ConvDwInt8Run(void *cdata, int task_id) { return RET_OK; } +int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() { + int output_row_size = 
conv_param_->thread_num_ * conv_param_->output_w_ * conv_param_->output_channel_; + row_buffer_ = reinterpret_cast(context_->allocator->Malloc(output_row_size * sizeof(float))); + if (row_buffer_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + return RET_OK; +} + int ConvolutionDepthwiseInt8CPUKernel::Run() { if (conv_param_->input_channel_ != conv_param_->output_channel_) { MS_LOG(ERROR) << "Only support input channel equals output channel."; @@ -156,13 +144,10 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() { } auto input_tensor = in_tensors_.at(kInputIndex); - auto input_addr = reinterpret_cast(input_tensor->Data()); - PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_); + input_ptr_ = reinterpret_cast(input_tensor->Data()); - auto output_addr = reinterpret_cast(out_tensors_.at(kOutputIndex)->Data()); - if (!need_align_) { - packed_output_ = output_addr; - } + auto output_tensor = out_tensors_.at(kOutputIndex); + output_ptr_ = reinterpret_cast(output_tensor->Data()); ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwInt8Run, this, conv_param_->thread_num_); if (ret != RET_OK) { @@ -170,12 +155,7 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() { return RET_ERROR; } - if (need_align_) { - PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_, - conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); - context_->allocator->Free(packed_output_); - } - context_->allocator->Free(packed_input_); + context_->allocator->Free(row_buffer_); return RET_OK; } @@ -186,8 +166,14 @@ kernel::LiteKernel *CpuConvDwInt8KernelCreator(const std::vectorGetQuantParams().size(); + if (filter_quant_size == 1) { // per tensor + kernel = new (std::nothrow) kernel::ConvolutionDepthwiseInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); + } else { // per channel + kernel = + new (std::nothrow) kernel::ConvolutionDepthwiseSWInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); + } if 
(kernel == nullptr) { MS_LOG(ERROR) << "kernel is nullptr."; return nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h index 57d7beac79cdd1b09c97e1b8c712e6a179d1eb00..b8661236bc9edb2f88648ac2fcf9aa1e50dfffe0 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h @@ -36,15 +36,14 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel { int Run() override; int InitWeightBias(); - int InitBuffer(); int Execute(int task_id); private: - SlidingWindowParam *sliding = nullptr; + int InitBuffer(); int16_t *packed_weight_ = nullptr; - int16_t *packed_input_ = nullptr; - int8_t *packed_output_ = nullptr; - bool need_align_ = false; + int8_t *input_ptr_ = nullptr; + int8_t *output_ptr_ = nullptr; + int32_t *row_buffer_ = nullptr; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.cc new file mode 100644 index 0000000000000000000000000000000000000000..3e4d3274b904d766c559bad45f35413ea5c98e99 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.cc @@ -0,0 +1,182 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h" +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "include/errorcode.h" +#include "nnacl/int8/conv_depthwise_int8.h" +#include "src/runtime/runtime_api.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_DepthwiseConv2D; + +namespace mindspore::kernel { +ConvolutionDepthwiseSWInt8CPUKernel::~ConvolutionDepthwiseSWInt8CPUKernel() { + if (sliding != nullptr) { + delete sliding; + sliding = nullptr; + } + if (packed_weight_ != nullptr) { + free(packed_weight_); + packed_weight_ = nullptr; + } + FreeQuantParam(); +} + +int ConvolutionDepthwiseSWInt8CPUKernel::InitWeightBias() { + // init weight, int8 -> int16 + // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1 + auto weight_tensor = in_tensors_[kWeightIndex]; + auto origin_weight = reinterpret_cast(weight_tensor->Data()); + int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM); + int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width(); + packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(int16_t))); + if (packed_weight_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(), + weight_tensor->Batch(), &(conv_param_->conv_quant_arg_)); + + bias_data_ = reinterpret_cast(malloc(C4NUM * OC4 * sizeof(int32_t))); + if (bias_data_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); + if (in_tensors_.size() == kInputSize2) { + auto bias_tensor = in_tensors_.at(kBiasIndex); + auto ori_bias = 
reinterpret_cast(bias_tensor->Data()); + memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t)); + } + + conv_param_->thread_num_ = MSMIN(thread_count_, OC4); + return RET_OK; +} + +int ConvolutionDepthwiseSWInt8CPUKernel::InitBuffer() { + int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * + UP_DIV(conv_param_->input_channel_, 4); + packed_input_ = reinterpret_cast(context_->allocator->Malloc(pack_input_size * sizeof(int16_t))); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + + if (conv_param_->input_channel_ % C4NUM != 0) { + need_align_ = true; + int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * + UP_DIV(conv_param_->output_channel_, C4NUM); + packed_output_ = reinterpret_cast(context_->allocator->Malloc(pack_output_size * sizeof(int8_t))); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + } + return RET_OK; +} + +int ConvolutionDepthwiseSWInt8CPUKernel::Init() { + sliding = new (std::nothrow) SlidingWindowParam; + if (sliding == nullptr) { + MS_LOG(ERROR) << "new sliding window param."; + return RET_ERROR; + } + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int ConvolutionDepthwiseSWInt8CPUKernel::ReSize() { + ConvolutionBaseCPUKernel::Init(); + InitSlidingParamConvDw(sliding, conv_param_, C4NUM); + + auto ret = ConvolutionBaseCPUKernel::SetQuantParam(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Set quant param failed."; + return ret; + } + ret = InitWeightBias(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!"; + return ret; + } + return RET_OK; +} + +int ConvolutionDepthwiseSWInt8CPUKernel::Execute(int task_id) { + ConvDwSWInt8(packed_output_, packed_input_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, + sliding, task_id); + return RET_OK; +} + 
+int ConvDwSWInt8Run(void *cdata, int task_id) { + auto conv_dw_int8 = reinterpret_cast(cdata); + auto ret = conv_dw_int8->Execute(task_id); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ConvolutionDepthwiseSWInt8Run error task_id[" << task_id << "] error_code[" << ret << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionDepthwiseSWInt8CPUKernel::Run() { + if (conv_param_->input_channel_ != conv_param_->output_channel_) { + MS_LOG(ERROR) << "Only support input channel equals output channel."; + return RET_ERROR; + } + auto ret = Prepare(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Prepare failed."; + return RET_ERROR; + } + + ret = InitBuffer(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Depthwise int8 ReSize error!"; + return ret; + } + + auto input_tensor = in_tensors_.at(kInputIndex); + auto input_addr = reinterpret_cast(input_tensor->Data()); + PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_); + + auto output_addr = reinterpret_cast(out_tensors_.at(kOutputIndex)->Data()); + if (!need_align_) { + packed_output_ = output_addr; + } + + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwSWInt8Run, this, conv_param_->thread_num_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ConvDwSWInt8Run error: error_code[" << ret << "]"; + return RET_ERROR; + } + + if (need_align_) { + PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_, + conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); + context_->allocator->Free(packed_output_); + } + context_->allocator->Free(packed_input_); + return RET_OK; +} + +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h new file mode 100644 index 0000000000000000000000000000000000000000..4c373c24661982ee0868b39ae59326e482ed6808 --- /dev/null +++ 
b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h @@ -0,0 +1,51 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_ + +#include +#include "src/lite_kernel.h" +#include "src/runtime/kernel/arm/base/convolution_base.h" +#include "nnacl/fp32/conv_depthwise.h" + +namespace mindspore::kernel { +class ConvolutionDepthwiseSWInt8CPUKernel : public ConvolutionBaseCPUKernel { + public: + ConvolutionDepthwiseSWInt8CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const Context *ctx, + const mindspore::lite::PrimitiveC *primitive) + : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} + ~ConvolutionDepthwiseSWInt8CPUKernel() override; + + int Init() override; + int ReSize() override; + int Run() override; + + int InitWeightBias(); + int InitBuffer(); + int Execute(int task_id); + + private: + SlidingWindowParam *sliding = nullptr; + int16_t *packed_weight_ = nullptr; + int16_t *packed_input_ = nullptr; + int8_t *packed_output_ = nullptr; + bool need_align_ = false; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_