Fix a bug in the optimized (opt) int8 matmul when running with multiple threads

0ec0ddc8 · zhanyuan · liuwenhao4 · 1821e98e · 0ec0ddc8 · 0ec0ddc8
5 changed files
--- a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
+++ b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
@@ -421,13 +421,37 @@ End3:
  smax v17.4s, v17.4s, v7.4s
  smax v18.4s, v18.4s, v7.4s
  smax v19.4s, v19.4s, v7.4s
-
-  // Apply the act_min bound
+  smax v20.4s, v20.4s, v7.4s
+  smax v21.4s, v21.4s, v7.4s
+  smax v22.4s, v22.4s, v7.4s
+  smax v23.4s, v23.4s, v7.4s
+  smax v24.4s, v24.4s, v7.4s
+  smax v25.4s, v25.4s, v7.4s
+  smax v26.4s, v26.4s, v7.4s
+  smax v27.4s, v27.4s, v7.4s
+  smax v28.4s, v28.4s, v7.4s
+  smax v29.4s, v29.4s, v7.4s
+  smax v30.4s, v30.4s, v7.4s
+  smax v31.4s, v31.4s, v7.4s
+
+  // Apply the act_max bound
  dup v6.4s, w9
  smin v16.4s, v16.4s, v6.4s
  smin v17.4s, v17.4s, v6.4s
  smin v18.4s, v18.4s, v6.4s
  smin v19.4s, v19.4s, v6.4s
+  smin v20.4s, v20.4s, v6.4s
+  smin v21.4s, v21.4s, v6.4s
+  smin v22.4s, v22.4s, v6.4s
+  smin v23.4s, v23.4s, v6.4s
+  smin v24.4s, v24.4s, v6.4s
+  smin v25.4s, v25.4s, v6.4s
+  smin v26.4s, v26.4s, v6.4s
+  smin v27.4s, v27.4s, v6.4s
+  smin v28.4s, v28.4s, v6.4s
+  smin v29.4s, v29.4s, v6.4s
+  smin v30.4s, v30.4s, v6.4s
+  smin v31.4s, v31.4s, v6.4s

  // int32 -> int16
  sqxtn v0.4h, v16.4s

--- a/mindspore/lite/nnacl/opt_op_handler.c
+++ b/mindspore/lite/nnacl/opt_op_handler.c
@@ -57,6 +57,6 @@ void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst,
                                  int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
                                  int32_t maxi, bool per_channel) {
  return MatmulInt8DpNeon64(a, b, dst, UP_ROUND(row, 8), UP_ROUND(col, 8), deep_4, input_sum, bias, mini, maxi,
-                            output_zp, multiplier[0], left_shift[0], right_shift[0], row, col, col);
+                            output_zp, multiplier[0], left_shift[0], right_shift[0], row, col, stride);
 }
 #endif
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
@@ -38,7 +38,7 @@ Convolution1x1Int8CPUKernel::~Convolution1x1Int8CPUKernel() {
    matmul_param_ = nullptr;
  }
  if (packed_weight_ != nullptr) {
-    delete packed_weight_;
+    free(packed_weight_);
    packed_weight_ = nullptr;
  }
  FreeResizeBuf();

--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
@@ -60,6 +60,7 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
  for (int i = 0; i < weight_tensor->ElementsNum(); i++) {
    packed_weight_[i] = (int16_t)(tmp_weight[i] - weight_zp);
  }
+  free(tmp_weight);

  bias_data_ = reinterpret_cast<int32_t *>(malloc(channel * sizeof(int32_t)));
  if (bias_data_ == nullptr) {

--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
@@ -402,7 +402,7 @@ kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector<lite::tensor::Ten
  if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) {
    kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  } else if (kernel_h == 1 && kernel_w == 1 && filter_quant_size == 1) {
-    kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+    kernel = new (std::nothrow) kernel::Convolution1x1Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  } else {
    kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  }