From 6cfcdaab3db7d7c79b83fed767b923f15c33ff03 Mon Sep 17 00:00:00 2001 From: ling Date: Fri, 21 Aug 2020 10:05:32 +0800 Subject: [PATCH] [MS][LITE][Develop]conv1x1 int8 --- mindspore/lite/nnacl/int8/conv_int8.c | 20 ++ mindspore/lite/nnacl/int8/conv_int8.h | 7 + mindspore/lite/nnacl/int8/deconv.c | 68 +---- mindspore/lite/nnacl/int8/matmul_int8.c | 47 ++- mindspore/lite/nnacl/int8/matmul_int8.h | 7 +- mindspore/lite/nnacl/matmul_parameter.h | 5 + mindspore/lite/nnacl/opt_op_handler.c | 8 + mindspore/lite/nnacl/pack.c | 111 ++++++- mindspore/lite/nnacl/pack.h | 7 +- .../kernel/arm/base/convolution_base.cc | 13 +- .../kernel/arm/fp32/convolution_1x1.cc | 2 +- .../kernel/arm/int8/convolution_1x1_int8.cc | 270 +++++++++++++++++ .../kernel/arm/int8/convolution_1x1_int8.h | 68 +++++ .../kernel/arm/int8/convolution_int8.cc | 4 + .../kernel/arm/fp32/conv1x1_fp32_tests.cc | 8 +- .../kernel/arm/int8/conv_1x1_int8_tests.cc | 281 ++++++++++++++++++ 16 files changed, 839 insertions(+), 87 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h create mode 100644 mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc diff --git a/mindspore/lite/nnacl/int8/conv_int8.c b/mindspore/lite/nnacl/int8/conv_int8.c index cbee7f19c..885577bd8 100644 --- a/mindspore/lite/nnacl/int8/conv_int8.c +++ b/mindspore/lite/nnacl/int8/conv_int8.c @@ -367,6 +367,26 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight } } +void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum, + const int32_t *bias, int row, int col, int deep16, ConvParameter *conv_param, + MATMUL_OPT_R_FUNC matmul_func) { + if (matmul_func != NULL) { + matmul_func(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias, + conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_, + conv_param->conv_quant_arg_.quant_multiplier_, conv_param->conv_quant_arg_.output_quant_args_[0].zp_, + conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], + (conv_param->conv_quant_arg_.filter_arg_num_ > 1)); + } else { + MatMulInt8_16x4_r(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias, + conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_, + conv_param->conv_quant_arg_.quant_multiplier_, + conv_param->conv_quant_arg_.output_quant_args_[0].zp_, + conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], + (conv_param->conv_quant_arg_.filter_arg_num_ > 1)); + } + return; +} + // int8 convolution 3x3 void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data, int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer, int8_t *tmp_out, diff --git a/mindspore/lite/nnacl/int8/conv_int8.h b/mindspore/lite/nnacl/int8/conv_int8.h index 730b031ce..101978953 100644 --- a/mindspore/lite/nnacl/int8/conv_int8.h +++ b/mindspore/lite/nnacl/int8/conv_int8.h @@ -25,6 +25,8 @@ #include "nnacl/conv_parameter.h" #include "nnacl/winograd_utils.h" #include "nnacl/quantization/quantize.h" +#include "nnacl/matmul_parameter.h" +#include "nnacl/int8/matmul_int8.h" typedef void (*GEMM_FUNC)(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias, size_t ksize, size_t ic4, size_t 
output_channel, size_t offset, const int32_t *input_sum, size_t act_min, @@ -51,6 +53,11 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight int32_t *tmp_dst, int8_t *tmp_out, int8_t *output_data, int32_t *input_sum, int task_id, ConvParameter *conv_param, GEMM_FUNC gemm_func); +// int8 convolution 1x1 +void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum, + const int32_t *bias, int row, int col, int deep16, ConvParameter *conv_param, + MATMUL_OPT_R_FUNC matmul_func); + // int8 convolution 3x3 void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data, int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer, int8_t *tmp_out, diff --git a/mindspore/lite/nnacl/int8/deconv.c b/mindspore/lite/nnacl/int8/deconv.c index b1389f195..a00aed842 100644 --- a/mindspore/lite/nnacl/int8/deconv.c +++ b/mindspore/lite/nnacl/int8/deconv.c @@ -172,73 +172,7 @@ void DeConvPackWeightSum(int8_t *weight, int32_t *weight_sum, int32_t input_zp, void DeConvPackInputSum(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16, bool suppport_opt) { /* optimize normal -> same layout */ -#ifdef ENABLE_ARM64 - asm volatile( - "mov x10, %[src] \n" - "mov x11, %[dst] \n" - "dup v15.4s, %w[filter_zp] \n" - - "mov x0, #0 \n" - "1: \n" - "cmp x0, %[row4] \n" - "beq 4f \n" - "add x0, x0, #4\n" - "dup v10.4s, wzr \n" - "mov x2, #0 \n" - - "2: \n" - "cmp x2, %[col16] \n" - "beq 3f \n" - "add x2, x2, #16\n" - - "ld1 {v0.16b}, [x10], #16\n" - "ld1 {v1.16b}, [x10], #16\n" - "ld1 {v2.16b}, [x10], #16\n" - "ld1 {v3.16b}, [x10], #16\n" - - "saddlp v4.8h, v0.16b \n" - "saddlp v5.8h, v1.16b \n" - "saddlp v6.8h, v2.16b \n" - "saddlp v7.8h, v3.16b \n" - - "saddlp v0.4S, v4.8h \n" - "saddlp v1.4S, v5.8h \n" - "saddlp v2.4S, v6.8h \n" - "saddlp v3.4S, v7.8h \n" - - "addv s4, v0.4S \n" - "addv s5, v1.4S \n" - "addv s6, v2.4S \n" - "addv s7, v3.4S \n" - - "mov v0.s[0], v4.s[0] \n" - "mov v0.s[1], v5.s[0] \n" - "mov v0.s[2], v6.s[0] \n" - "mov v0.s[3], v7.s[0] \n" - - "add v10.4s, v10.4s, v0.4s \n" - "b 2b\n" - - "3: \n" - "mul v10.4s, v10.4s, v15.4s \n" - "st1 {v10.4s}, [x11], #16 \n" - "beq 1b \n" - - "4: \n" - - : - : [ dst ] "r"(dst), [ src ] "r"(src), [ row4 ] "r"(row4), [ col16 ] "r"(col16), [ filter_zp ] "r"(filter_zp) - : "x0", "x1", "x2", "x3", "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v10", "v15"); -#else - for (int r = 0; r < row4; r++) { - int32_t tmp_value = 0; - for (int c = 0; c < col16; c++) { - int r4div = r / C4NUM, r4mod = r % C4NUM, c16div = c / C16NUM, c16mod = c % C16NUM; - int src_index = r4div * C4NUM * col16 + c16div * C16NUM * C4NUM + r4mod * C16NUM + c16mod; - tmp_value += src[src_index]; - } - } -#endif + PackInputSum16x4PerLater(src, dst, filter_zp, row4, col16); return; } diff --git a/mindspore/lite/nnacl/int8/matmul_int8.c b/mindspore/lite/nnacl/int8/matmul_int8.c index aa93dacf8..467b03bfd 100644 --- a/mindspore/lite/nnacl/int8/matmul_int8.c +++ b/mindspore/lite/nnacl/int8/matmul_int8.c @@ -28,6 +28,19 @@ void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) } } +void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) { + int col16 = UP_ROUND(col, C16NUM); + for (int r = 0; r < row; r++) { + int rd4 = r / C4NUM; + int rm4 = r % C4NUM; + for (int c = 0; c < col; c++) { + int cd16 = c / C16NUM; + int cm16 = c % C16NUM; + dst_ptr[cd16 * col16 * C4NUM + rd4 * C4NUM 
* C16NUM + rm4 * C16NUM + cm16] = src_ptr[r * col16 + c]; + } + } +} + void MatrixPack4x16UnitInt8(int8_t *src, int8_t *dst, int row, int col, int stride) { for (int r = 0; r < row; r++) { int8_t *src_r = src + r * stride; @@ -145,7 +158,38 @@ void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int return; } -#ifdef ENABLE_ARM64 +void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, + bool per_channel) { + /* row4x16-major * row16x4-major => (int8)row-major : per-channel */ + for (int r = 0; r < row; r++) { + for (int c = 0; c < col; c++) { + int r4div = r / C4NUM, r4mod = r % C4NUM; + int c4div = c / C4NUM, c4mod = c % C4NUM; + size_t ci = r * stride + c; + int32_t value = 0; + for (int d = 0; d < deep_16; d++) { + int d16div = d / C16NUM, d16mod = d % C16NUM; + size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod; + size_t bi = c4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + c4mod * C16NUM + d16mod; + value = value + a[ai] * b[bi]; + } + int32_t cur_input_sum = per_channel ? input_sum[c4div * UP_ROUND(row, C4NUM) + r * C4NUM + c4mod] : input_sum[r]; + value -= cur_input_sum; + value += bias[c]; + int32_t cur_left_shift = per_channel ? left_shift[c] : left_shift[0]; + int32_t cur_right_shift = per_channel ? right_shift[c] : right_shift[0]; + int32_t cur_multiplier = per_channel ? multiplier[c] : multiplier[0]; + value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp; + value = MSMIN(maxi, value); + value = MSMAX(mini, value); + dst[ci] = (int8_t)value; + } + } + return; +} + void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16) { int stride = sizeof(int8_t) * 16 * 4; for (int r = 0; r < row; ++r) { @@ -201,4 +245,3 @@ void Row4x4Major2RowMajor(int8_t *src, int row4, int8_t *dst, int row, int cow) } } } -#endif diff --git a/mindspore/lite/nnacl/int8/matmul_int8.h b/mindspore/lite/nnacl/int8/matmul_int8.h index bf6ab900a..7e7f2e9f9 100644 --- a/mindspore/lite/nnacl/int8/matmul_int8.h +++ b/mindspore/lite/nnacl/int8/matmul_int8.h @@ -28,17 +28,22 @@ void MatMulInt8(const int8_t *a, const int8_t *b, int *c, const int row8, const const int a_zp, const int b_zp); void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16, const int *input_sum, const int *bias); +void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, + bool per_channel); void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); +void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col); -#ifdef ENABLE_ARM64 void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16); void RowMajor2Col16x4Major(int8_t *src, int row, int col, int8_t *dst, int row_16); void RowMajor2Asums(int8_t *a, int row, int col, int b_zp, int *dst); void RowMajor2Bbias(int8_t *b, int row, int col, 
int a_zp, int b_zp, int *bias, int *dst); void Row4x4Major2RowMajor(int8_t *src, int row4, int8_t *dst, int row, int cow); +#ifdef ENABLE_ARM64 // bias = bias + depth * a_zp * b_zp - a_zp * b_sums void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, int multiplier, int left_shift, diff --git a/mindspore/lite/nnacl/matmul_parameter.h b/mindspore/lite/nnacl/matmul_parameter.h index 9e290e784..7cad0e05e 100644 --- a/mindspore/lite/nnacl/matmul_parameter.h +++ b/mindspore/lite/nnacl/matmul_parameter.h @@ -22,6 +22,11 @@ typedef void (*MATMUL_OPT_R4_FUNC)(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16, const int *input_sum, const int *bias); +typedef void (*MATMUL_OPT_R_FUNC)(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, + int32_t maxi, bool per_channel); + typedef void (*MAT_TRANS_FUNC)(void *dst, void *a, int row, int col); typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType; diff --git a/mindspore/lite/nnacl/opt_op_handler.c b/mindspore/lite/nnacl/opt_op_handler.c index 14d6309f1..52c7767ee 100644 --- a/mindspore/lite/nnacl/opt_op_handler.c +++ b/mindspore/lite/nnacl/opt_op_handler.c @@ -15,6 +15,7 @@ */ #include +#include #ifdef __cplusplus extern "C" { @@ -45,4 +46,11 @@ void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, i const int *input_sum, const int *bias) { return MatMulOptR4Int8Neon64(a, b, dst, row4, col4, deep16, input_sum, bias); } + +void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, + int32_t maxi, bool per_channel) { + return; +} #endif diff --git a/mindspore/lite/nnacl/pack.c b/mindspore/lite/nnacl/pack.c index 327ae0467..5ae56ef40 100644 --- a/mindspore/lite/nnacl/pack.c +++ b/mindspore/lite/nnacl/pack.c @@ -153,22 +153,24 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p } // kernel plane loop } -void Conv1x1InputPackFp32(const float *src, float *dst, ConvParameter *conv_param) { +void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_param, int data_size) { /* support nhwc */ + char *src = (char *)src_ptr; + char *dst = (char *)dst_ptr; for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) { int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_; if (src_h < 0 || src_h >= conv_param->input_h_) { continue; } - const float *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_; - float *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_; + const char *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_ * data_size; + char *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_ * data_size; for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) { int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_; if (src_w < 0 || src_w >= conv_param->input_w_) { continue; } - memcpy(dst_h_ptr + dst_w * conv_param->input_channel_, src_h_ptr + src_w * conv_param->input_channel_, - 
conv_param->input_channel_ * sizeof(float)); + memcpy(dst_h_ptr + dst_w * conv_param->input_channel_ * data_size, + src_h_ptr + src_w * conv_param->input_channel_ * data_size, conv_param->input_channel_ * data_size); } } return; @@ -188,6 +190,105 @@ void Pack1x1WeightFp32(const float *weight_data, float *packed_weight, ConvParam return; } +void PackInputSum16x4PerLater(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16) { + /* optimize normal -> same layout */ +#ifdef ENABLE_ARM64 + asm volatile( + "mov x10, %[src] \n" + "mov x11, %[dst] \n" + "dup v15.4s, %w[filter_zp] \n" + + "mov x0, #0 \n" + "1: \n" + "cmp x0, %[row4] \n" + "beq 4f \n" + "add x0, x0, #4\n" + "dup v10.4s, wzr \n" + "mov x2, #0 \n" + + "2: \n" + "cmp x2, %[col16] \n" + "beq 3f \n" + "add x2, x2, #16\n" + + "ld1 {v0.16b}, [x10], #16\n" + "ld1 {v1.16b}, [x10], #16\n" + "ld1 {v2.16b}, [x10], #16\n" + "ld1 {v3.16b}, [x10], #16\n" + + "saddlp v4.8h, v0.16b \n" + "saddlp v5.8h, v1.16b \n" + "saddlp v6.8h, v2.16b \n" + "saddlp v7.8h, v3.16b \n" + + "saddlp v0.4S, v4.8h \n" + "saddlp v1.4S, v5.8h \n" + "saddlp v2.4S, v6.8h \n" + "saddlp v3.4S, v7.8h \n" + + "addv s4, v0.4S \n" + "addv s5, v1.4S \n" + "addv s6, v2.4S \n" + "addv s7, v3.4S \n" + + "mov v0.s[0], v4.s[0] \n" + "mov v0.s[1], v5.s[0] \n" + "mov v0.s[2], v6.s[0] \n" + "mov v0.s[3], v7.s[0] \n" + + "add v10.4s, v10.4s, v0.4s \n" + "b 2b\n" + + "3: \n" + "mul v10.4s, v10.4s, v15.4s \n" + "st1 {v10.4s}, [x11], #16 \n" + "beq 1b \n" + + "4: \n" + + : + : [ dst ] "r"(dst), [ src ] "r"(src), [ row4 ] "r"(row4), [ col16 ] "r"(col16), [ filter_zp ] "r"(filter_zp) + : "x0", "x1", "x2", "x3", "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v10", "v15"); +#else + for (int r = 0; r < row4; r++) { + int32_t tmp_value = 0; + for (int c = 0; c < col16; c++) { + int r4div = r / C4NUM, r4mod = r % C4NUM, c16div = c / C16NUM, c16mod = c % C16NUM; + int src_index = r4div * C4NUM * col16 + c16div * C16NUM * C4NUM + r4mod * C16NUM + c16mod; + tmp_value += src[src_index]; + } + dst[r] = tmp_value * filter_zp; + } +#endif + return; +} + +void PackInputSum16x4Int8(int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel, + size_t plane_size, ConvParameter *conv_param) { + size_t hw4 = UP_ROUND(plane_size, C4NUM); + size_t ic16 = UP_ROUND(input_channel, C16NUM); + if (conv_param->conv_quant_arg_.filter_arg_num_ == 1) { + PackInputSum16x4PerLater(input_value, input_sum, conv_param->conv_quant_arg_.filter_quant_args_[0].zp_, hw4, ic16); + } else { + for (int ri = 0; ri < plane_size; ri++) { + int ri4div = ri / C4NUM, ri4mod = ri % C4NUM; + for (int ci = 0; ci < output_channel; ci++) { + int32_t tmp_sum_value = 0; + int ci4div = ci / C4NUM, ci4mod = ci % C4NUM; + int32_t filter_zp = conv_param->conv_quant_arg_.filter_quant_args_[ci].zp_; + for (int di = 0; di < input_channel; di++) { + size_t di16div = di / C16NUM, di16mod = di % C16NUM; + int src_index = ri4div * C4NUM * ic16 + di16div * C16NUM * C4NUM + ri4mod * C16NUM + di16mod; + tmp_sum_value += input_value[src_index]; + } + int dst_index = ci4div * C4NUM * hw4 + ri * C4NUM + ci4mod; + input_sum[dst_index] = tmp_sum_value * filter_zp; + } + } + } + return; +} + void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, float *packed_input, int real_cal_num, int block_index) { // input format : nhwc diff --git a/mindspore/lite/nnacl/pack.h b/mindspore/lite/nnacl/pack.h index 3567732fe..d380bbf5c 100644 --- a/mindspore/lite/nnacl/pack.h +++ 
b/mindspore/lite/nnacl/pack.h @@ -35,10 +35,15 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index, int32_t *input_sum, ConvParameter *conv_param); -void Conv1x1InputPackFp32(const float *src, float *dst, ConvParameter *conv_param); +void PackInputSum16x4PerLater(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16); + +void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_param, int data_size); void Pack1x1WeightFp32(const float *weight_data, float *packed_weight, ConvParameter *conv_param); +void PackInputSum16x4Int8(int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel, + size_t plane_size, ConvParameter *conv_param); + void MatrixPack(const float *src, float *dst, int row, int ic4, int stride); void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param); diff --git a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc index 51a0d0715..deb0e277e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc @@ -118,10 +118,13 @@ int ConvolutionBaseCPUKernel::CheckLayout(lite::tensor::Tensor *input_tensor) { } int ConvolutionBaseCPUKernel::SetIfPerChannel() { + auto filter_tensor = in_tensors_.at(kWeightIndex); + auto input_channel = filter_tensor->Channel(); + auto output_channel = filter_tensor->Batch(); + uint8_t per_channel = 0b0; if (conv_quant_arg_->input_arg_num_ != kPerTensor) { - int in_channel = conv_param_->input_channel_; - if (static_cast(conv_quant_arg_->input_arg_num_) != in_channel) { + if (static_cast(conv_quant_arg_->input_arg_num_) != input_channel) { MS_LOG(ERROR) << "input per channel quant param length is not equal to input channel."; return RET_ERROR; } @@ -129,8 +132,7 @@ int ConvolutionBaseCPUKernel::SetIfPerChannel() { } if (conv_quant_arg_->filter_arg_num_ != kPerTensor) { - int filter_num = conv_param_->output_channel_; - if (static_cast(conv_quant_arg_->filter_arg_num_) != filter_num) { + if (static_cast(conv_quant_arg_->filter_arg_num_) != output_channel) { MS_LOG(ERROR) << "weight per channel quant param length is not equal to filter num."; return RET_ERROR; } @@ -138,8 +140,7 @@ int ConvolutionBaseCPUKernel::SetIfPerChannel() { } if (conv_quant_arg_->output_arg_num_ != kPerTensor) { - int out_channel = conv_param_->output_channel_; - if (static_cast(conv_quant_arg_->output_arg_num_) != out_channel) { + if (static_cast(conv_quant_arg_->output_arg_num_) != output_channel) { MS_LOG(ERROR) << "output per channel quant param length is not equal to output channel."; return RET_ERROR; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc index 30b2b6a1e..768a5460c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc @@ -113,7 +113,7 @@ void Convolution1x1CPUKernel::Pre1x1Trans(float *src_input, float *src_output) { output_ptr_ = src_output; if (pre_trans_input_) { - Conv1x1InputPackFp32(src_input, input_ptr_, conv_param_); + Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(float)); } else { input_ptr_ = src_input; } diff --git 
a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc new file mode 100644 index 000000000..39d32f7a8 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc @@ -0,0 +1,270 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h" +#include "src/runtime/runtime_api.h" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_MEMORY_FAILED; +using mindspore::lite::RET_OK; + +namespace mindspore::kernel { + +Convolution1x1Int8CPUKernel::~Convolution1x1Int8CPUKernel() { + if (matmul_param_ != nullptr) { + delete matmul_param_; + matmul_param_ = nullptr; + } + if (packed_weight_ != nullptr) { + delete packed_weight_; + packed_weight_ = nullptr; + } + FreeResizeBuf(); + FreeQuantParam(); +} + +void Convolution1x1Int8CPUKernel::FreeResizeBuf() { + if (packed_input_ != nullptr) { + free(packed_input_); + packed_input_ = nullptr; + } + if (input_sum_ != nullptr) { + free(input_sum_); + input_sum_ = nullptr; + } + return; +} + +void Convolution1x1Int8CPUKernel::CheckSupportOptimize() { + support_optimize_ = false; + matmul_func_ = MatMulInt8_16x4_r; +#ifdef ENABLE_ARM64 + void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; + if (optimize_op_handler != nullptr) { + dlerror(); + *(reinterpret_cast(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulRInt8_optimize_handler"); + auto dlopen_error = dlerror(); + if (dlopen_error != nullptr) { + MS_LOG(ERROR) << "load matmul func failed! 
" << dlopen_error << "."; + support_optimize_ = false; + matmul_func_ = nullptr; + } else { + support_optimize_ = true; + } + } else { + support_optimize_ = false; + matmul_func_ = nullptr; + } +#endif + + matmul_func_ = MatMulInt8_16x4_r; + return; +} + +int Convolution1x1Int8CPUKernel::InitWeightBias() { + auto filter_tensor = in_tensors_.at(kWeightIndex); + auto input_channel = filter_tensor->Channel(); + auto output_channel = filter_tensor->Batch(); + + /* weight */ + size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t); + packed_weight_ = reinterpret_cast(malloc(size)); + if (packed_weight_ == nullptr) { + MS_LOG(ERROR) << "Conv1x1 int8 Malloc weight error!"; + return RET_ERROR; + } + memset(packed_weight_, 0, size); + RowMajor2Row4x16MajorInt8(reinterpret_cast(filter_tensor->Data()), packed_weight_, output_channel, + input_channel); + + /* bias = bias - v2 x zp1 + zp1 x zp2 */ + int col4 = UP_ROUND(output_channel, C4NUM); + bias_data_ = malloc(col4 * sizeof(int32_t)); + if (bias_data_ == nullptr) { + MS_LOG(ERROR) << "Conv1x1 int8 Malloc bias_ptr_ error!"; + return RET_ERROR; + } + memset(bias_data_, 0, col4 * sizeof(int32_t)); + if (in_tensors_.size() == 3) { + memcpy(bias_data_, in_tensors_[kBiasIndex]->Data(), output_channel * sizeof(int32_t)); + } + + int32_t *bias_data = reinterpret_cast(bias_data_); + int8_t *weight = reinterpret_cast(filter_tensor->Data()); + int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_; + for (int oc = 0; oc < output_channel; oc++) { + int32_t weight_sum_value = 0; + int32_t filter_zp = (conv_param_->conv_quant_arg_.filter_arg_num_ == 1) + ? conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_ + : conv_param_->conv_quant_arg_.filter_quant_args_[oc].zp_; + for (int ic = 0; ic < input_channel; ic++) { + weight_sum_value += weight[oc * input_channel + ic]; + } + bias_data[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp; + } + return RET_OK; +} + +int Convolution1x1Int8CPUKernel::Init() { + if (!InferShapeDone()) { + return RET_OK; + } + matmul_param_ = new (std::nothrow) MatMulParameter(); + if (matmul_param_ == nullptr) { + MS_LOG(ERROR) << "Init matmul_param_ failed."; + return RET_ERROR; + } + + CheckSupportOptimize(); + + auto ret = SetQuantParam(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Set quant param failed."; + return ret; + } + + ret = InitWeightBias(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init weight bias failed."; + return ret; + } + + return ReSize(); +} + +int Convolution1x1Int8CPUKernel::InitParam() { + pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 || + conv_param_->stride_w_ != 1); + + matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_; + matmul_param_->deep_ = conv_param_->input_channel_; + matmul_param_->col_ = conv_param_->output_channel_; + + thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C4NUM)); + thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C4NUM), thread_count_); + + size_t size = UP_ROUND(matmul_param_->row_, C4NUM) * UP_ROUND(matmul_param_->deep_, C16NUM); + packed_input_ = reinterpret_cast(malloc(size * sizeof(int8_t))); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "conv1x1 int8 Malloc packed_input_ error!"; + return RET_ERROR; + } + memset(packed_input_, 0, size * sizeof(int8_t)); + + if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { + size = UP_ROUND(conv_param_->output_channel_, C4NUM) * 
UP_ROUND(matmul_param_->row_, C4NUM); + } else { + size = UP_ROUND(matmul_param_->row_, C4NUM); + } + input_sum_ = reinterpret_cast(malloc(size * sizeof(int32_t))); + if (input_sum_ == nullptr) { + MS_LOG(ERROR) << "malloc input_sum_ failed."; + return RET_ERROR; + } + memset(input_sum_, 0, size * sizeof(int32_t)); + + return RET_OK; +} + +int Convolution1x1Int8CPUKernel::ReSize() { + FreeResizeBuf(); + + ConvolutionBaseCPUKernel::Init(); + + int error_code = InitParam(); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Convolution base init failed."; + return error_code; + } + return RET_OK; +} + +void Convolution1x1Int8CPUKernel::Pre1x1Trans(int8_t *src_input, int8_t *src_output) { + output_ptr_ = src_output; + if (pre_trans_input_) { + Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(int8_t)); + } else { + input_ptr_ = src_input; + } + RowMajor2Row16x4MajorInt8(input_ptr_, packed_input_, matmul_param_->row_, matmul_param_->deep_); + return; +} + +int Convolution1x1Int8CPUKernel::RunImpl(int task_id) { + int cur_oc = MSMIN(thread_stride_ * C4NUM, matmul_param_->col_ - task_id * thread_stride_ * C4NUM); + if (cur_oc <= 0) { + return RET_OK; + } + + int32_t *bias = reinterpret_cast(bias_data_) + thread_stride_ * C4NUM * task_id; + + Conv1x1Int8(packed_input_, packed_weight_ + task_id * thread_stride_ * C4NUM * matmul_param_->deep_, + output_ptr_ + task_id * thread_stride_ * C4NUM, input_sum_, bias + task_id * thread_stride_ * C4NUM, + matmul_param_->row_, cur_oc, UP_ROUND(matmul_param_->deep_, C16NUM), conv_param_, matmul_func_); + return RET_OK; +} + +int Convolution1x1Int8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { + auto conv = reinterpret_cast(cdata); + auto error_code = conv->RunImpl(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "conv1x1 Int8 Run error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int Convolution1x1Int8CPUKernel::Run() { + auto ret = Prepare(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Prepare failed."; + return RET_ERROR; + } + + if (pre_trans_input_) { + input_ptr_ = + reinterpret_cast(ctx_->allocator->Malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t))); + if (input_ptr_ == nullptr) { + MS_LOG(ERROR) << "Conv1x1 int8 Malloc input_ptr_ error!"; + return RET_MEMORY_FAILED; + } + } + + int8_t *src_in = reinterpret_cast(in_tensors_[0]->Data()); + int8_t *src_out = reinterpret_cast(out_tensors_[0]->Data()); + + for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { + Pre1x1Trans(src_in + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_, + src_out + batch_index * matmul_param_->row_ * matmul_param_->col_); + + PackInputSum16x4Int8(packed_input_, input_sum_, matmul_param_->deep_, matmul_param_->col_, matmul_param_->row_, + conv_param_); + + int error_code = LiteBackendParallelLaunch(Convolution1x1Int8Impl, this, thread_count_); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "conv1x1 fp16 error error_code[" << error_code << "]"; + return RET_ERROR; + } + } + + if (pre_trans_input_ && input_ptr_ != nullptr) { + ctx_->allocator->Free(input_ptr_); + input_ptr_ = nullptr; + } + + return RET_OK; +} +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h new file mode 100644 index 000000000..f3e201885 --- /dev/null +++ 
b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h @@ -0,0 +1,68 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_ + +#include +#include "src/lite_kernel.h" +#include "include/errorcode.h" +#include "schema/model_generated.h" +#include "src/runtime/kernel/arm/base/convolution_base.h" +#include "nnacl/int8/conv_int8.h" +#include "nnacl/int8/matmul_int8.h" +#include "nnacl/matmul_parameter.h" +#include "nnacl/optimized_kernel.h" + +namespace mindspore::kernel { +class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel { + public: + Convolution1x1Int8CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const Context *ctx, + const mindspore::lite::PrimitiveC *primitive) + : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} + ~Convolution1x1Int8CPUKernel() override; + + int Init() override; + int ReSize() override; + int Run() override; + + public: + int RunImpl(int task_id); + + private: + void FreeResizeBuf(); + int InitParam(); + int InitWeightBias(); + void Pre1x1Trans(int8_t *src_input, int8_t *src_output); + void CheckSupportOptimize(); + + private: + int32_t *input_sum_ = nullptr; /* per-channel: oc4 format */ + int8_t *packed_weight_ = nullptr; + int8_t *packed_input_ = nullptr; + int8_t *input_ptr_ = nullptr; + int8_t *output_ptr_ = nullptr; + size_t thread_count_ = 1; + size_t thread_stride_ = 0; + bool pre_trans_input_ = false; + MatMulParameter *matmul_param_ = nullptr; + MATMUL_OPT_R_FUNC matmul_func_ = nullptr; + bool support_optimize_ = false; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc index 36760454e..21cd2ff7c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc @@ -16,6 +16,7 @@ #include "src/runtime/kernel/arm/int8/convolution_int8.h" #include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h" +#include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h" #include "nnacl/int8/conv_int8.h" #include "src/runtime/kernel/arm/base/layout_transform.h" #include "schema/model_generated.h" @@ -400,6 +401,9 @@ kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vectorpad_h_ = conv_param->pad_w_ = 2; float out[20] = {0}; - Conv1x1InputPackFp32(in, out, conv_param); + Conv1x1InputPack(in, out, conv_param, sizeof(float)); EXPECT_EQ(0, lite::CompareOutputData(out, correct, 20)); delete conv_param; } @@ -95,7 +95,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack2) { conv_param->pad_h_ = conv_param->pad_w_ = 0; float out[28] = {0}; - Conv1x1InputPackFp32(in, out, conv_param); + 
Conv1x1InputPack(in, out, conv_param, sizeof(float)); CompareOutputData(out, correct, 28, 0.0001); delete conv_param; } @@ -114,7 +114,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack3) { float correct[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 17.025112, -5.052577, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - Conv1x1InputPackFp32(in, out, conv_param); + Conv1x1InputPack(in, out, conv_param, sizeof(float)); EXPECT_EQ(0, lite::CompareOutputData(out, correct, 18)); delete conv_param; } @@ -136,7 +136,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) { -1.770, 41.903, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; float out[54] = {0}; - Conv1x1InputPackFp32(in, out, conv_param); + Conv1x1InputPack(in, out, conv_param, sizeof(float)); EXPECT_EQ(0, lite::CompareOutputData(out, correct, 54)); delete conv_param; } diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc new file mode 100644 index 000000000..00abebfce --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc @@ -0,0 +1,281 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "utils/log_adapter.h" +#include "common/common_test.h" +#include "mindspore/lite/src/lite_kernel.h" +#include "src/common/file_utils.h" +#include "nnacl/quantization/quantize.h" +#include "nnacl/common_func.h" +#include "mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h" + +namespace mindspore { +using lite::tensor::Tensor; +class TestConv1x1Int8 : public mindspore::CommonTest { + public: + TestConv1x1Int8() {} +}; + +TEST_F(TestConv1x1Int8, Input1x1PrePack1) { + auto conv_param = new ConvParameter(); + conv_param->input_channel_ = 6; + conv_param->input_h_ = conv_param->input_w_ = 3; + conv_param->output_h_ = conv_param->output_w_ = 3; + conv_param->stride_h_ = conv_param->stride_w_ = 2; + conv_param->pad_h_ = conv_param->pad_w_ = 1; + int8_t in[] = {4, 13, -3, 16, 19, 8, 19, -6, -2, -9, 9, 18, 23, 8, 47, -14, 15, 4, + -0, 37, -0, 6, 0, -1, 37, 13, 11, 1, -1, 41, 9, 14, 3, 0, 8, 9, + 14, -14, -8, -8, -8, 7, 19, 17, 13, 3, 9, 18, -1, -0, 18, 0, 4, -2}; + int8_t correct[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37, 13, 11, + 1, -1, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int8_t out[54] = {0}; + Conv1x1InputPack(in, out, conv_param, sizeof(int8_t)); + CompareOutputData(out, correct, 54, 0); + delete conv_param; +} + +TEST_F(TestConv1x1Int8, Input1x1PrePack2) { + auto conv_param = new ConvParameter(); + int8_t in[] = {-0, -0, -7, -0, -6, 4, 9, 9, 12, -0, 6, 2, 13, 15, 16, -7, 9, 1, 10, 13, 17, 17, 4, 13, + -6, 5, 7, -7, 15, 0, 1, -5, -7, 18, 15, 19, -7, 13, 7, -0, 16, -5, 16, -7, 6, 10, -5, 10, + 9, 12, -9, -8, -4, 18, -5, 0, 7, 12, 13, 16, -9, -4, 18, -0, 8, 6, 2, 10, 16, 1, -1, 2, + 9, 8, 9, 13, 7, -0, 15, -7, 0, -0, 17, 19, 9, 17, -6, -2, 7, -0, 10, -6, -6, 18, -0, 9, + 9, 6, 3, -1, -8, 10, 17, -9, 17, 6, -3, 7, -2, -0, -9, 1, -3, 15, 13, 4, 18}; + int8_t correct[] = {0, 0, 0, 0, 0, 0, 15, -7, -7, 0, 0, 0, 9, 7, 0, 0, 0, 0, 0, 0}; + + conv_param->input_h_ = 9; + conv_param->input_w_ = 13; + conv_param->input_channel_ = 1; + conv_param->output_h_ = 4; + conv_param->output_w_ = 5; + conv_param->stride_h_ = conv_param->stride_w_ = 4; + conv_param->pad_h_ = conv_param->pad_w_ = 2; + + int8_t out[20] = {0}; + Conv1x1InputPack(in, out, conv_param, sizeof(int8_t)); + CompareOutputData(out, correct, 20, 0); + delete conv_param; +} + +int Conv1x1Int8TestInit1_perchannel(std::vector *inputs_, + std::vector *outputs_, ConvParameter *conv_param, + int8_t **correct) { + Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast(1)); + auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); + in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647; + in_t->AddQuantParam(*in_quant_arg); + in_t->MallocData(); + int8_t in[] = {62, -14, 88, 2, -35, 43, 83, -111, 75, 26, 14, -121, + -78, 56, 37, -31, 15, -75, -10, -115, -71, 74, -65, -15}; + memcpy(in_t->Data(), in, in_t->ElementsNum() * sizeof(int8_t)); + inputs_->push_back(in_t); + + Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast(1)); + weight_t->MallocData(); + auto weight_quant_arg1 = new mindspore::lite::tensor::QuantArg(); + weight_quant_arg1->zeroPoint = 66, weight_quant_arg1->scale = 0.96439215686275; + auto weight_quant_arg2 = new mindspore::lite::tensor::QuantArg(); + weight_quant_arg2->zeroPoint = 33, weight_quant_arg2->scale = 0.76439215686275; + auto weight_quant_arg3 = new mindspore::lite::tensor::QuantArg(); + weight_quant_arg3->zeroPoint = -20, 
weight_quant_arg3->scale = 0.99117647; + weight_t->AddQuantParam(*weight_quant_arg1); + weight_t->AddQuantParam(*weight_quant_arg2); + weight_t->AddQuantParam(*weight_quant_arg3); + int8_t weight[] = {65, 67, 65, 65, 32, 33, 34, 33, -19, -20, -19, -20}; + memcpy(weight_t->Data(), weight, weight_t->ElementsNum() * sizeof(int8_t)); + inputs_->push_back(weight_t); + + Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast(1)); + out_t->MallocData(); + auto output_quant_arg = new mindspore::lite::tensor::QuantArg(); + output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.294321233; + out_t->AddQuantParam(*output_quant_arg); + outputs_->push_back(out_t); + + *correct = reinterpret_cast(malloc(out_t->ElementsNum() * sizeof(int8_t))); + int8_t nchw_co[] = {-83, 34, 100, 10, 113, 55, 3, 16, 63, 6, 93, 20, 5, 6, 42, 35, 28, -24}; + memcpy(*correct, nchw_co, out_t->ElementsNum() * sizeof(int8_t)); + + conv_param->kernel_h_ = conv_param->kernel_w_ = 1; + conv_param->stride_h_ = conv_param->stride_w_ = 1; + conv_param->dilation_h_ = conv_param->dilation_w_ = 1; + conv_param->pad_h_ = conv_param->pad_w_ = 0; + conv_param->is_relu_ = conv_param->is_relu6_ = false; + return out_t->ElementsNum(); +} + +TEST_F(TestConv1x1Int8, Conv1x1TestPerChannel) { + std::vector inputs_; + std::vector outputs_; + auto conv_param = new ConvParameter(); + int8_t *correct; + auto ctx = new lite::Context; + ctx->thread_num_ = 1; + int total_size = Conv1x1Int8TestInit1_perchannel(&inputs_, &outputs_, conv_param, &correct); + kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel( + reinterpret_cast(conv_param), inputs_, outputs_, ctx, nullptr); + + conv1x1->Init(); + conv1x1->Run(); + CompareOutputData(reinterpret_cast(outputs_[0]->Data()), correct, total_size, 70); + + delete conv1x1; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + free(correct); +} + +int Conv1x1Int8TestInit1(std::vector *inputs_, std::vector *outputs_, + ConvParameter *conv_param, int8_t **correct) { + Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast(1)); + auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); + in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647; + in_t->AddQuantParam(*in_quant_arg); + in_t->MallocData(); + float in[] = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, 14.727955, -8.080715, + 13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352, + 6.7263746, -3.9301445, 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; + Quantize(in, in_t->ElementsNum(), in_quant_arg->scale, in_quant_arg->zeroPoint, + reinterpret_cast(in_t->Data())); + inputs_->push_back(in_t); + + Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast(1)); + auto weight_quant_arg = new mindspore::lite::tensor::QuantArg(); + weight_quant_arg->zeroPoint = 66, weight_quant_arg->scale = 0.036439215686275; + weight_t->AddQuantParam(*weight_quant_arg); + weight_t->MallocData(); + float weight[] = {-0.7308652, 0.5257509, -0.87825793, -1.123181, -1.2206168, 0.562695, + 1.5382664, -0.5020635, 0.8591602, -0.26410004, 1.1262615, 0.073132955}; + Quantize(weight, weight_t->ElementsNum(), weight_quant_arg->scale, weight_quant_arg->zeroPoint, + reinterpret_cast(weight_t->Data())); + inputs_->push_back(weight_t); + + Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast(1)); + 
out_t->MallocData(); + auto output_quant_arg = new mindspore::lite::tensor::QuantArg(); + output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.234321233; + out_t->AddQuantParam(*output_quant_arg); + outputs_->push_back(out_t); + + *correct = reinterpret_cast(malloc(out_t->ElementsNum() * sizeof(int8_t))); + float nchw_co[] = {-26.51016327, 7.92113757, 27.25741343, 0.785643655, 31.3307619, 14.05927672, + -1.178490666, 2.5676252, 16.39408946, -0.394793726, 25.2866881, 3.827249175, + -0.626854507, -0.3122176, 10.42769169, 8.362184085, 6.04617807, -9.252362384}; + Quantize(nchw_co, out_t->ElementsNum(), output_quant_arg->scale, output_quant_arg->zeroPoint, *correct); + + conv_param->kernel_h_ = conv_param->kernel_w_ = 1; + conv_param->stride_h_ = conv_param->stride_w_ = 1; + conv_param->dilation_h_ = conv_param->dilation_w_ = 1; + conv_param->pad_h_ = conv_param->pad_w_ = 0; + conv_param->is_relu_ = conv_param->is_relu6_ = false; + return out_t->ElementsNum(); +} + +TEST_F(TestConv1x1Int8, Conv1x1Int8Test1) { + std::vector inputs_; + std::vector outputs_; + auto conv_param = new ConvParameter(); + int8_t *correct; + auto ctx = new lite::Context; + ctx->thread_num_ = 1; + int total_size = Conv1x1Int8TestInit1(&inputs_, &outputs_, conv_param, &correct); + kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel( + reinterpret_cast(conv_param), inputs_, outputs_, ctx, nullptr); + + conv1x1->Init(); + conv1x1->Run(); + CompareOutputData(reinterpret_cast(outputs_[0]->Data()), correct, total_size, 2); + + delete conv1x1; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + free(correct); +} + +int Conv1x1Int8TestInit2(std::vector *inputs_, std::vector *outputs_, + ConvParameter *conv_param, int8_t **correct) { + size_t buffer_size; + Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast(1)); + auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); + in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647; + in_t->AddQuantParam(*in_quant_arg); + in_t->MallocData(); + std::string input_path = "./input"; + auto input = mindspore::lite::ReadFile(input_path.c_str(), &buffer_size); + memcpy(in_t->Data(), input, buffer_size); + inputs_->push_back(in_t); + delete[] input; + + Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast(1)); + auto weight_quant_arg = new mindspore::lite::tensor::QuantArg(); + weight_quant_arg->zeroPoint = 66, weight_quant_arg->scale = 0.036439215686275; + weight_t->AddQuantParam(*weight_quant_arg); + weight_t->MallocData(); + std::string weight_path = "./weight"; + auto weight = mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size); + memcpy(weight_t->Data(), weight, buffer_size); + inputs_->push_back(weight_t); + delete[] weight; + + Tensor *bias_t = new Tensor(kNumberTypeInt32, {4}, schema::Format_NHWC, static_cast(1)); + weight_t->MallocData(); + std::string bias_path = "./bias"; + auto bias = mindspore::lite::ReadFile(bias_path.c_str(), &buffer_size); + memcpy(bias_t->Data(), bias, buffer_size); + inputs_->push_back(bias_t); + delete[] bias; + + Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast(1)); + out_t->MallocData(); + auto output_quant_arg = new mindspore::lite::tensor::QuantArg(); + output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.234321233; + out_t->AddQuantParam(*output_quant_arg); + outputs_->push_back(out_t); + + *correct = 
reinterpret_cast(malloc(out_t->ElementsNum() * sizeof(int8_t))); + std::string output_path = "./output"; + auto output = mindspore::lite::ReadFile(output_path.c_str(), &buffer_size); + memcpy(*correct, output, buffer_size); + delete[] output; + + conv_param->kernel_h_ = conv_param->kernel_w_ = 1; + conv_param->stride_h_ = conv_param->stride_w_ = 1; + conv_param->dilation_h_ = conv_param->dilation_w_ = 1; + conv_param->pad_h_ = conv_param->pad_w_ = 0; + conv_param->is_relu_ = conv_param->is_relu6_ = false; + return out_t->ElementsNum(); +} + +TEST_F(TestConv1x1Int8, Conv1x1Int8Test2) { + std::vector inputs_; + std::vector outputs_; + auto conv_param = new ConvParameter(); + int8_t *correct; + auto ctx = new lite::Context; + ctx->thread_num_ = 1; + int total_size = Conv1x1Int8TestInit2(&inputs_, &outputs_, conv_param, &correct); + kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel( + reinterpret_cast(conv_param), inputs_, outputs_, ctx, nullptr); + + conv1x1->Init(); + conv1x1->Run(); + CompareOutputData(reinterpret_cast(outputs_[0]->Data()), correct, total_size, 2); + + delete conv1x1; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + free(correct); +} +} // namespace mindspore -- GitLab
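Conv1x1InputPack generalizes the former Conv1x1InputPackFp32 to any element size: for every output pixel it either copies one pixel's worth of channels from the strided source position or, when stride/padding maps that position outside the input, leaves the destination untouched. A hedged usage sketch for the int8 path, assuming the caller zero-fills the destination first (as the unit tests do) so skipped pixels read as zero; pack_1x1_input_int8 is an illustrative wrapper, not part of the patch.

#include <stdint.h>
#include <string.h>
#include "nnacl/pack.h" /* Conv1x1InputPack, ConvParameter */

/* Illustrative wrapper: gather one batch of a strided/padded 1x1 conv input
 * into the dense output_h * output_w * input_channel buffer fed to the matmul. */
void pack_1x1_input_int8(const int8_t *src, int8_t *dst, ConvParameter *conv_param) {
  size_t dst_size =
      (size_t)conv_param->output_h_ * conv_param->output_w_ * conv_param->input_channel_;
  memset(dst, 0, dst_size); /* pixels skipped by stride/padding stay zero */
  Conv1x1InputPack(src, dst, conv_param, sizeof(int8_t));
}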
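InitWeightBias folds the weight-side zero-point terms of the quantized product into the bias: expanding (q_in - zp_in)(q_w - zp_w) over the input channels gives a constant part ic * zp_in * zp_w - zp_in * sum(q_w) that can be precomputed per output channel, while the activation-dependent part zp_w * sum(q_in) is handled at runtime through input_sum. A minimal sketch of that fold with plain-array arguments (names are illustrative; for per-tensor filter quantization the same zero point is passed for every output channel):

#include <stdint.h>

/* Fold the weight-side zero-point terms into the bias, one value per output
 * channel: bias[o] += zp_w[o] * zp_in * ic - zp_in * sum_i weight[o][i]. */
void fold_conv1x1_bias(const int8_t *weight /* oc x ic, row-major */, const int32_t *filter_zp,
                       int32_t input_zp, int oc, int ic, int32_t *bias /* in/out, length oc */) {
  for (int o = 0; o < oc; ++o) {
    int32_t weight_sum = 0;
    for (int i = 0; i < ic; ++i) {
      weight_sum += weight[o * ic + i];
    }
    bias[o] += filter_zp[o] * input_zp * ic - weight_sum * input_zp;
  }
}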
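PackInputSum16x4Int8 precomputes, for every packed input row, the activation sum scaled by the filter zero point. With a single (per-tensor) filter zero point that is one int32 per row4-padded row, which is what PackInputSum16x4PerLater produces; with per-channel filter zero points the same row sum must be multiplied by each channel's zero point, so the result is stored in oc4-blocked layout. The sketch below restates the per-channel reference loop from the patch in plain C, with the row sum hoisted out of the channel loop since it depends only on the row:

#include <stdint.h>

#define C4NUM 4
#define C16NUM 16
#define UP_ROUND(x, y) (((x) + (y)-1) / (y) * (y))

/* input is packed row4 x deep16; output index is ci4div*4*hw4 + ri*4 + ci4mod. */
void input_sum_per_channel(const int8_t *packed_in, int32_t *dst, const int32_t *filter_zp,
                           int plane, int in_ch, int out_ch) {
  int hw4 = UP_ROUND(plane, C4NUM);
  int ic16 = UP_ROUND(in_ch, C16NUM);
  for (int ri = 0; ri < plane; ++ri) {
    int ri4div = ri / C4NUM, ri4mod = ri % C4NUM;
    int32_t row_sum = 0;
    for (int di = 0; di < in_ch; ++di) {
      int di16div = di / C16NUM, di16mod = di % C16NUM;
      row_sum += packed_in[ri4div * C4NUM * ic16 + di16div * C16NUM * C4NUM + ri4mod * C16NUM + di16mod];
    }
    for (int ci = 0; ci < out_ch; ++ci) {
      int ci4div = ci / C4NUM, ci4mod = ci % C4NUM;
      dst[ci4div * C4NUM * hw4 + ri * C4NUM + ci4mod] = row_sum * filter_zp[ci];
    }
  }
}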
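The reference matmul MatMulInt8_16x4_r accumulates in int32, subtracts the precomputed input_sum, adds the folded bias, and then requantizes each value with a fixed-point multiplier before the zero point and activation clamp. The helper below is only a scalar approximation of that requantize step, assuming a gemmlowp-style rounding-doubling multiply and a non-negative right-shift count; it is not the nnacl MultiplyByQuantizedMultiplier implementation itself, and requant_one is an illustrative name:

#include <stdint.h>

static int8_t requant_one(int32_t acc, int32_t multiplier, int32_t left_shift,
                          int32_t right_shift, int32_t out_zp, int32_t act_min, int32_t act_max) {
  /* rounding-doubling high multiply of the left-shifted accumulator
   * (simplified: assumes a small left_shift and skips saturation corner cases) */
  int64_t prod = ((int64_t)acc << left_shift) * (int64_t)multiplier;
  int32_t high = (int32_t)((prod + (1ll << 30)) >> 31);
  /* rounding divide by 2^right_shift (right_shift taken as a non-negative count here) */
  int32_t mask = (1 << right_shift) - 1;
  int32_t rounding = high & mask;
  int32_t value = (high >> right_shift) + (rounding > (mask >> 1) ? 1 : 0);
  value += out_zp;
  if (value > act_max) value = act_max;
  if (value < act_min) value = act_min;
  return (int8_t)value;
}

In the kernel, the per_channel flag only decides whether multiplier and shifts are indexed by output channel or taken from element 0 of the quant arrays.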
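InitParam and RunImpl split the output channels across threads in blocks of four: thread_stride_ is the number of 4-channel blocks per task, and each task clamps its channel count at the tail (or skips entirely when nothing is left). A standalone sketch of that partitioning with an illustrative channel count:

#include <stdio.h>

#define C4NUM 4
#define UP_DIV(x, y) (((x) + (y)-1) / (y))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  int col = 22;      /* output channels (illustrative) */
  int thread_num = 4;
  int thread_count = MSMIN(thread_num, UP_DIV(col, C4NUM));
  int thread_stride = UP_DIV(UP_DIV(col, C4NUM), thread_count);
  for (int task_id = 0; task_id < thread_count; ++task_id) {
    int cur_oc = MSMIN(thread_stride * C4NUM, col - task_id * thread_stride * C4NUM);
    if (cur_oc <= 0) continue; /* this task has no channels left */
    printf("task %d handles oc [%d, %d)\n", task_id, task_id * thread_stride * C4NUM,
           task_id * thread_stride * C4NUM + cur_oc);
  }
  return 0;
}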
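CheckSupportOptimize resolves the hand-optimized matmul from the optional optimized-op library via dlsym and falls back to the C reference MatMulInt8_16x4_r when the library or symbol is missing; note that in this patch the pointer is reset to the C reference at the end of the function and MatMulRInt8_optimize_handler is still an empty stub, so the optimized path is effectively disabled for now. A generic sketch of the resolve-or-fallback pattern, with a hypothetical simplified signature:

#include <dlfcn.h>
#include <stdio.h>

typedef void (*matmul_fn)(void); /* illustrative signature */

matmul_fn resolve_matmul(void *handle, matmul_fn fallback) {
  if (handle == NULL) return fallback;
  dlerror(); /* clear any stale error */
  matmul_fn fn = NULL;
  *(void **)(&fn) = dlsym(handle, "MatMulRInt8_optimize_handler");
  const char *err = dlerror();
  if (err != NULL || fn == NULL) {
    fprintf(stderr, "load matmul func failed: %s\n", err ? err : "null symbol");
    return fallback;
  }
  return fn;
}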
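The unit tests build int8 tensors from float references with the usual asymmetric quantization, q = clamp(round(x / scale) + zero_point, -128, 127). The tests call nnacl's Quantize for this; the standalone helper below only illustrates that mapping and is not the library routine:

#include <math.h>
#include <stdint.h>

static int8_t quantize_one(float value, double scale, int32_t zero_point) {
  int32_t q = (int32_t)lround(value / scale) + zero_point;
  if (q > 127) q = 127;
  if (q < -128) q = -128;
  return (int8_t)q;
}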