deconv int8

6fefe939 · ling · d541e261 · 6fefe939 · 6fefe939 · 6fefe939
14 changed files
--- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc
@@ -27,7 +27,10 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DeConv2D;

 namespace mindspore::kernel {
-DeConvInt8CPUKernel::~DeConvInt8CPUKernel() { FreeTmpBuffer(); }
+/* Releases the kernel's temporary buffers first, then the quantization parameters owned through the
+ * convolution base class (FreeTmpBuffer no longer frees them itself). */
+DeConvInt8CPUKernel::~DeConvInt8CPUKernel() {
+  FreeTmpBuffer();
+  ConvolutionBaseCPUKernel::FreeQuantParam();
+}

 void DeConvInt8CPUKernel::FreeTmpBuffer() {
  if (weight_ptr_ != nullptr) {
@@ -46,20 +49,18 @@ void DeConvInt8CPUKernel::FreeTmpBuffer() {
    free(tmp_output_);
    tmp_output_ = nullptr;
  }
-  ConvolutionBaseCPUKernel::FreeQuantParam();
+  if (input_sum_ != nullptr) {
+    free(input_sum_);
+    input_sum_ = nullptr;
+  }
+  return;
 }

 int DeConvInt8CPUKernel::ReSize() {
  FreeTmpBuffer();

  ConvolutionBaseCPUKernel::Init();
-  int error_code = ConvolutionBaseCPUKernel::SetQuantParam();
-  if (error_code != RET_OK) {
-    MS_LOG(ERROR) << "deconv int8 SetQuantParam error!";
-    return error_code;
-  }
-
-  error_code = InitParam();
+  int error_code = InitParam();
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "deconv int8 InitParam error!";
    return error_code;
@@ -79,76 +80,117 @@ int DeConvInt8CPUKernel::ReSize() {
  return RET_OK;
 }

+int DeConvInt8CPUKernel::Init() {
+  /* Once shapes are inferred: pick the matmul routine, load the quantization parameters, then let ReSize()
+   * build the buffers. Returns the first non-RET_OK code encountered. */
+  if (InferShapeDone()) {
+    CheckSupportOptimize();
+
+    const int quant_ret = ConvolutionBaseCPUKernel::SetQuantParam();
+    if (quant_ret != RET_OK) {
+      MS_LOG(ERROR) << "deconv int8 SetQuantParam error!";
+      return quant_ret;
+    }
+    return ReSize();
+  }
+
+  /* Shape inference has not completed yet: nothing is set up and success is reported. */
+  return RET_OK;
+}
+
+void DeConvInt8CPUKernel::CheckSupportOptimize() {
+  /* Selects the matmul routine used by DoDeconv. MatMulOptR4Int8 is the only routine wired up so far, so it is
+   * chosen unconditionally and the optimized path is flagged as available. */
+#ifdef ENABLE_ARM64
+  /* todo */
+#endif
+
+  matmul_func_ = MatMulOptR4Int8;
+  support_optimize_ = true;
+}
+
 int DeConvInt8CPUKernel::InitParam() {
-  fc_param_ = new MatMulParameter();
-  fc_param_->row_ = conv_param_->input_h_ * conv_param_->input_w_;
-  fc_param_->deep_ = conv_param_->input_channel_;
-  fc_param_->col_ = conv_param_->output_channel_ * conv_param_->kernel_h_ * conv_param_->kernel_w_;
-  fc_param_->row_8_ = UP_ROUND(fc_param_->row_, C8NUM);
-  fc_param_->col_8_ = UP_ROUND(conv_param_->output_channel_, C8NUM) * conv_param_->kernel_h_ * conv_param_->kernel_w_;
-
-  size_t oc8 = UP_DIV(conv_param_->output_channel_, C8NUM);
-  thread_count_ = MSMIN(op_parameter_->thread_num_, oc8);
-  thread_stride_ = UP_DIV(oc8, thread_count_) * C8NUM;
+  /* Derives the matmul geometry from the convolution parameters and splits the output channels over threads.
+   *   row  = input_h * input_w
+   *   deep = input_channel
+   *   col  = output_channel * kernel_h * kernel_w
+   * This runs on every ReSize(), so the parameter object is allocated once and refreshed afterwards instead of
+   * leaking a new MatMulParameter per call. */
+  if (matmul_param_ == nullptr) {
+    matmul_param_ = new MatMulParameter();
+  }
+  matmul_param_->row_ = conv_param_->input_h_ * conv_param_->input_w_;
+  matmul_param_->deep_ = conv_param_->input_channel_;
+  matmul_param_->col_ = conv_param_->output_channel_ * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+
+  if (support_optimize_) {
+    input_trans_func_ = RowMajor2Row16x4MajorInt8;
+    /* thread_stride_ is expressed in blocks of C4NUM output channels. */
+    size_t oc4 = UP_DIV(conv_param_->output_channel_, C4NUM);
+    thread_count_ = MSMIN(op_parameter_->thread_num_, oc4);
+    thread_stride_ = UP_DIV(oc4, thread_count_);
+  } else {
+    /*todo */
+  }
   return RET_OK;
  }

 int DeConvInt8CPUKernel::InitBiasWeight() {
+  /* Allocates and fills bias_data_, the packed weight buffer weight_ptr_ and the per-column weight_sum_ table.
+   * Returns RET_ERROR when any allocation fails, RET_OK otherwise. */
+  /* The bias buffer is now always allocated (zero-filled, padded to C4NUM channels), even without a bias input. */
+  size_t size = UP_ROUND(conv_param_->output_channel_, C4NUM) * sizeof(int32_t);
+  bias_data_ = malloc(size);
+  if (bias_data_ == nullptr) {
+    MS_LOG(ERROR) << "deconv int8 malloc bias_data_ error!";
+    return RET_ERROR;
+  }
+  memset(bias_data_, 0, size);
   if (in_tensors_.size() == 3) {
-    size_t size = UP_ROUND(conv_param_->output_channel_, C8NUM) * sizeof(int32_t);
-    bias_data_ = malloc(size);
-    if (bias_data_ == nullptr) {
-      MS_LOG(ERROR) << "deconv int8 malloc bias_data_ error!";
-      return RET_ERROR;
-    }
-    memset(bias_data_, 0, size);
+    /* NOTE(review): the bias values are copied from in_tensors_[0] while the weights come from in_tensors_[1];
+     * confirm that index 0 really holds the bias tensor. */
     memcpy(bias_data_, in_tensors_[0]->Data(), conv_param_->output_channel_ * sizeof(int32_t));
-  } else {
-    bias_data_ = nullptr;
   }

-  /* weight:  ichwoc(nhwc)  ->  oc8 * h * w * inc * 8 */
-  size_t size = conv_param_->kernel_w_ * conv_param_->kernel_h_ * UP_ROUND(conv_param_->output_channel_, C8NUM) *
-                conv_param_->input_channel_ * sizeof(int8_t);
+  /* Packed weight: output channels padded to C4NUM, input channels padded to C16NUM, one slab per kernel tap. */
+  size = UP_ROUND(conv_param_->output_channel_, C4NUM) * UP_ROUND(conv_param_->input_channel_, C16NUM) *
+         conv_param_->kernel_w_ * conv_param_->kernel_h_ * sizeof(int8_t);
   weight_ptr_ = reinterpret_cast<int8_t *>(malloc(size));
   if (weight_ptr_ == nullptr) {
     MS_LOG(ERROR) << "deconv int8 malloc weight_ptr_ error!";
     return RET_ERROR;
   }
-  memset(weight_ptr_, 0, size);
-  PackNHWCToC8HWN8Int8(in_tensors_[1]->Data(), weight_ptr_, conv_param_->input_channel_,
-                       conv_param_->kernel_h_ * conv_param_->kernel_w_, conv_param_->output_channel_);
+  /* Every byte is preset to the filter zero point, so entries the repack below does not write keep that value. */
+  memset(weight_ptr_, static_cast<int8_t>(conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_), size);
+  DeConvWeightTransInt8(reinterpret_cast<int8_t *>(in_tensors_[1]->Data()), weight_ptr_, conv_param_->input_channel_,
+                        conv_param_->output_channel_, conv_param_->kernel_h_ * conv_param_->kernel_w_,
+                        support_optimize_);
+
+  /* One int32 per packed weight column (padded output channels * kernel taps). */
+  /* NOTE(review): none of the changed lines in this commit free weight_sum_ - confirm it is released on
+   * ReSize()/destruction. */
+  size = UP_ROUND(conv_param_->output_channel_, C4NUM) * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  weight_sum_ = reinterpret_cast<int32_t *>(malloc(size * sizeof(int32_t)));
+  if (weight_sum_ == nullptr) {
+    MS_LOG(ERROR) << "deconv int8 malloc weight_sum_ error!";
+    return RET_ERROR;
+  }
+  memset(weight_sum_, 0, size * sizeof(int32_t));
+  DeConvPackWeightSum(weight_ptr_, weight_sum_, conv_param_->conv_quant_arg_.input_quant_args_[0].zp_,
+                      conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_, UP_ROUND(matmul_param_->deep_, C16NUM),
+                      size, support_optimize_);
+
   return RET_OK;
  }

 int DeConvInt8CPUKernel::InitData() {
-  int size = UP_ROUND(conv_param_->input_h_ * conv_param_->input_w_, C8NUM) * conv_param_->input_channel_;
+  /* Allocates the per-run scratch buffers: packed input, raw matmul result, accumulated output and input sums.
+   * Returns RET_MEMORY_FAILED as soon as one malloc fails, RET_OK otherwise. */
+  /* Packed input: rows padded to C4NUM, input channels padded to C16NUM. */
+  int size =
+    UP_ROUND(conv_param_->input_h_ * conv_param_->input_w_, C4NUM) * UP_ROUND(conv_param_->input_channel_, C16NUM);
   input_ptr_ = reinterpret_cast<int8_t *>(malloc(size * sizeof(int8_t)));
   if (input_ptr_ == nullptr) {
     return RET_MEMORY_FAILED;
   }
-  memset(input_ptr_, 0, size * sizeof(int8_t));
+  /* Preset to the input zero point, so positions the input packing does not overwrite hold that value. */
+  memset(input_ptr_, static_cast<int8_t>(conv_param_->conv_quant_arg_.input_quant_args_[0].zp_), size * sizeof(int8_t));

-  size = UP_ROUND(conv_param_->input_h_ * conv_param_->input_w_, C8NUM) *
-         UP_ROUND(conv_param_->output_channel_, C8NUM) * conv_param_->kernel_w_ * conv_param_->kernel_h_;
+  /* Matmul result: padded rows x (padded output channels * kernel taps), int32. */
+  size = UP_ROUND(conv_param_->input_h_ * conv_param_->input_w_, C4NUM) *
+         UP_ROUND(conv_param_->output_channel_, C4NUM) * conv_param_->kernel_w_ * conv_param_->kernel_h_;
   tmp_buffer_ = reinterpret_cast<int32_t *>(malloc(size * sizeof(int32_t)));
   if (tmp_buffer_ == nullptr) {
     return RET_MEMORY_FAILED;
   }

-  size = UP_ROUND(conv_param_->output_channel_, C8NUM) * conv_param_->output_h_ * conv_param_->output_w_;
+  /* Accumulated output before requantization: padded output channels x output plane, int32. */
+  size = UP_ROUND(conv_param_->output_channel_, C4NUM) * conv_param_->output_h_ * conv_param_->output_w_;
   tmp_output_ = reinterpret_cast<int32_t *>(malloc(size * sizeof(int32_t)));
   if (tmp_output_ == nullptr) {
     return RET_MEMORY_FAILED;
   }
-  return RET_OK;
-}

-int DeConvInt8CPUKernel::Init() {
-  if (!InferShapeDone()) {
-    return RET_OK;
+  /* One int32 per padded input row; filled by DeConvPackInputSum for every batch in Run(). */
+  size = UP_ROUND(matmul_param_->row_, C4NUM);
+  input_sum_ = reinterpret_cast<int32_t *>(malloc(size * sizeof(int32_t)));
+  if (input_sum_ == nullptr) {
+    return RET_MEMORY_FAILED;
   }
-  return ReSize();
+
+  return RET_OK;
  }

 int DeConvInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
@@ -161,46 +203,26 @@ int DeConvInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
  return RET_OK;
 }

-int DeConvInt8PostFuncRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
-  auto deconv = reinterpret_cast<DeConvInt8CPUKernel *>(cdata);
-  auto error_code = deconv->DoPostFunc(task_id);
-  if (error_code != RET_OK) {
-    MS_LOG(ERROR) << "DeConvInt8PostFuncRun error task_id[" << task_id << "] error_code[" << error_code << "]";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
 int DeConvInt8CPUKernel::DoDeconv(int task_id) {
-  int cur_oc = MSMIN(thread_stride_, UP_ROUND(conv_param_->output_channel_, C8NUM) - task_id * thread_stride_);
+  /* Worker for one parallel task: runs the matmul for this task's slice of output channels, then scatters and
+   * requantizes that slice into output_ptr_. Always returns RET_OK.
+   *
+   * thread_stride_ counts blocks of C4NUM output channels (see InitParam), so the remaining work is counted in
+   * C4NUM blocks as well. The arithmetic is done in int: thread_stride_ is a size_t, and an unsigned
+   * subtraction would wrap instead of going negative, which would defeat the "nothing left" check below. */
+  int stride = static_cast<int>(thread_stride_);
+  int cur_oc = MSMIN(stride, UP_DIV(conv_param_->output_channel_, C4NUM) - task_id * stride);
+  int cur_oc_res = MSMIN(stride * C4NUM, conv_param_->output_channel_ - task_id * stride * C4NUM);
   if (cur_oc <= 0) {
     return RET_OK;
   }

-  int input_plane = conv_param_->input_h_ * conv_param_->input_w_;
-  int kernel_plane = conv_param_->kernel_w_ * conv_param_->kernel_h_;
-
-  DeConvInt8(input_ptr_, weight_ptr_ + task_id * thread_stride_ * kernel_plane * conv_param_->input_channel_,
-             tmp_buffer_ + task_id * thread_stride_ * input_plane * kernel_plane, fc_param_->row_8_,
-             cur_oc * kernel_plane, fc_param_->deep_, conv_param_);
-
-  return RET_OK;
-}
-
-int DeConvInt8CPUKernel::DoPostFunc(int task_id) {
   int input_plane = conv_param_->input_h_ * conv_param_->input_w_;
   int kernel_plane = conv_param_->kernel_w_ * conv_param_->kernel_h_;
   int output_plane = conv_param_->output_h_ * conv_param_->output_w_;

-  int cur_oc = MSMIN(thread_stride_, conv_param_->output_channel_ - task_id * thread_stride_);
-  if (cur_oc <= 0) {
-    return RET_OK;
-  }
+  /* NOTE(review): the weight and tmp_buffer_ offsets below are built from input_channel_ and input_plane, while
+   * the matmul is given the C16NUM/C4NUM-rounded sizes - confirm the offsets when those dimensions are not
+   * already multiples of C16NUM/C4NUM. */
+  DeConvInt8(input_ptr_, weight_ptr_ + task_id * thread_stride_ * C4NUM * kernel_plane * conv_param_->input_channel_,
+             tmp_buffer_ + task_id * thread_stride_ * C4NUM * input_plane * kernel_plane, weight_sum_, input_sum_,
+             UP_ROUND(matmul_param_->row_, C4NUM), cur_oc * C4NUM * kernel_plane,
+             UP_ROUND(matmul_param_->deep_, C16NUM), conv_param_, matmul_func_);

-  DeConvPostInt8(tmp_buffer_ + task_id * thread_stride_ * input_plane * kernel_plane,
-                 reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_,
-                 tmp_output_ + task_id * thread_stride_ * output_plane, output_ptr_ + task_id * thread_stride_, cur_oc,
-                 conv_param_);
+  /* cur_oc_res is the number of real (unpadded) output channels handled by this task. */
+  DeConvPostInt8(tmp_buffer_ + task_id * thread_stride_ * C4NUM * input_plane * kernel_plane,
+                 reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C4NUM,
+                 tmp_output_ + task_id * thread_stride_ * C4NUM * output_plane,
+                 output_ptr_ + task_id * thread_stride_ * C4NUM, cur_oc_res, conv_param_, support_optimize_);
   return RET_OK;
  }

@@ -214,20 +236,18 @@ int DeConvInt8CPUKernel::Run() {
  int8_t *src_out = reinterpret_cast<int8_t *>(out_tensors_[0]->Data());

  for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
-    RowMajor2Col8MajorInt8(src_in + batch_index * fc_param_->row_ * conv_param_->input_channel_, input_ptr_,
-                           fc_param_->row_, fc_param_->deep_);
-    output_ptr_ = src_out + batch_index * fc_param_->col_;
+    input_trans_func_(src_in + batch_index * matmul_param_->row_ * conv_param_->input_channel_, input_ptr_,
+                      matmul_param_->row_, matmul_param_->deep_);
+    output_ptr_ = src_out + batch_index * matmul_param_->col_;
+
+    DeConvPackInputSum(input_ptr_, input_sum_, conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
+                       UP_ROUND(matmul_param_->row_, C4NUM), UP_ROUND(matmul_param_->deep_, C16NUM), support_optimize_);

    int error_code = LiteBackendParallelLaunch(DeConvInt8Run, this, thread_count_);
    if (error_code != RET_OK) {
      MS_LOG(ERROR) << "deconv int8 run error! error_code[" << error_code << "]";
      return RET_ERROR;
    }
-    error_code = LiteBackendParallelLaunch(DeConvInt8PostFuncRun, this, thread_count_);
-    if (error_code != RET_OK) {
-      MS_LOG(ERROR) << "deconv int8 post run error! error_code[" << error_code << "]";
-      return RET_ERROR;
-    }
  }

  return RET_OK;

--- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.h
@@ -23,6 +23,7 @@
 #include "include/errorcode.h"
 #include "src/runtime/kernel/arm/nnacl/matmul_parameter.h"
 #include "src/runtime/kernel/arm/nnacl/int8/deconv.h"
+#include "src/runtime/kernel/arm/nnacl/int8/common_func.h"
 #include "src/runtime/kernel/arm/nnacl/int8/matmul_int8.h"
 #include "src/runtime/kernel/arm/base/layout_transform.h"
 #include "src/runtime/kernel/arm/base/convolution_base.h"
@@ -43,23 +44,28 @@ class DeConvInt8CPUKernel : public ConvolutionBaseCPUKernel {

 public:
  int DoDeconv(int task_id);
-  int DoPostFunc(int task_id);

 private:
+  void FreeTmpBuffer();
  int InitData();
  int InitParam();
  int InitBiasWeight();
+  void CheckSupportOptimize();

 private:
-  void FreeTmpBuffer();
-  MatMulParameter *fc_param_ = nullptr;
-  int8_t *weight_ptr_ = nullptr;
-  int8_t *input_ptr_ = nullptr;   /* record c8 input*/
  int32_t *tmp_buffer_ = nullptr; /* record matmul result */
  int32_t *tmp_output_ = nullptr; /* record post c8 result */
+  int32_t *input_sum_ = nullptr;  /* record in * w_zp  */
+  int32_t *weight_sum_ = nullptr; /* record w_v * in_zp - in_zp * w_zp */
+  int8_t *input_ptr_ = nullptr;   /* packed input */
+  int8_t *weight_ptr_ = nullptr;  /* packed weight */
  int8_t *output_ptr_ = nullptr;
-  size_t thread_count_;
-  size_t thread_stride_;
+  size_t thread_count_ = 1;
+  size_t thread_stride_ = 0;
+  MATMUL_OPT_R4_FUNC matmul_func_;
+  MAT_TRANS_FUNC input_trans_func_;
+  MatMulParameter *matmul_param_ = nullptr;
+  bool support_optimize_ = true;
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_DECONVOLUTION_INT8_H_
--- a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc
@@ -129,8 +129,8 @@ int FullconnectionInt8CPUKernel::Run() {
  auto &p = quant_params_;
  RowMajor2Col8MajorInt8(a_ptr, a_c8_ptr_, fc_param_->row_, fc_param_->deep_);
  LiteBackendParallelLaunch(FcInt8Run, this, thread_count_);
-  PostFuncInt8(c_r8x8_ptr_, bias_ptr_, output_ptr, fc_param_->col_, fc_param_->row_, fc_param_->row_8_,
-               p.quant_multiplier, p.left_shift, p.right_shift, p.output.zp_, p.out_act_min, p.out_act_max);
+  PostFuncInt8C8(c_r8x8_ptr_, bias_ptr_, output_ptr, fc_param_->col_, fc_param_->row_, p.quant_multiplier, p.left_shift,
+                 p.right_shift, p.output.zp_, p.out_act_min, p.out_act_max);
  return RET_OK;
 }


--- a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
@@ -21,6 +21,7 @@
 #include "include/context.h"
 #include "src/runtime/kernel/arm/nnacl/quantization/quantize.h"
 #include "src/runtime/kernel/arm/base/fullconnection_base.h"
+#include "src/runtime/kernel/arm/nnacl/int8/common_func.h"

 using mindspore::lite::Context;


--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/common_func.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/common_func.c
@@ -228,27 +228,6 @@ void IndirectGemmFp32_Comm(float *output, const float *input, const float *weigh
  return;
 }

-void PostFuncInt8(const int *in, const int *bias, int8_t *out, int oc, int plane, int plane8, int32_t multiplier,
-                  int32_t left_shift, int32_t right_shift, int32_t zp, int8_t mini, int8_t maxi) {
-  /*  (int32_t)row8x8-major * multiplier + bias  =>  (int8)relu  =>  (int8_t)row-major  */
-  for (int r = 0; r < plane; r++) {
-    for (int c = 0; c < oc; c++) {
-      int c8div = c / 8, c8mod = c % 8;
-      int src_index = c8div * plane8 * 8 + r * 8 + c8mod;
-      int dst_index = r * oc + c;
-      int32_t value = in[src_index];
-      if (bias != NULL) {
-        value = in[src_index] + bias[c];
-      }
-      value = MultiplyByQuantizedMultiplier(value, multiplier, left_shift, right_shift) + zp;
-      value = MSMIN(maxi, value);
-      value = MSMAX(mini, value);
-      out[dst_index] = (int8_t)value;
-    }
-  }
-  return;
-}
-
 void SimplePostFuncInt8(const int *in, int8_t *out, int oc, int plane, int plane8, int32_t multiplier,
                        int32_t left_shift, int32_t right_shift, int32_t zp) {
  /*  (int32_t)row8x8-major * multiplier => (int8_t)row-major  */
@@ -265,4 +244,3 @@ void SimplePostFuncInt8(const int *in, int8_t *out, int oc, int plane, int plane
    }
  }
 }
-
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/common_func.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/common_func.h
@@ -31,8 +31,6 @@ int8_t MinInt8(int8_t a, int8_t b);
 int8_t MaxInt8(int8_t a, int8_t b);
 void ReluFp32(float *data, float *dst, int ele_num);
 void Relu6Fp32(float *data, float *dst, int ele_num);
-void PostFuncInt8(const int *in, const int *bias, int8_t *out, int oc, int plane, int plane8, int32_t multiplier,
-                  int32_t left_shift, int32_t right_shift, int32_t zp, int8_t mini, int8_t maxi);
 void SimplePostFuncInt8(const int *in, int8_t *out, int oc, int plane, int plane8, int32_t multiplier,
                        int32_t left_shift, int32_t right_shift, int32_t zp);
 void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight, const float *bias, size_t step,

--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/common_func.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/common_func.c
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/int8/common_func.h"
+
+void PostConvFuncCommInt8(const int32_t *in, int8_t *out, const int32_t *bias, size_t oc, size_t plane,
+                          size_t out_oc_stride, size_t in_plane_stride, int32_t multiplier, int8_t mini, int8_t maxi,
+                          int32_t left_shift, int32_t right_shift, int32_t zp, int size) {
+  /* Requantizes a channel-blocked int32 buffer into a row-major int8 buffer.
+   *   in              : blocks of `size` channels; consecutive blocks are in_plane_stride elements apart
+   *   out             : row-major result, out_oc_stride elements per row
+   *   bias            : optional per-channel bias (may be NULL)
+   *   oc, plane       : number of channels and rows to convert
+   *   mini, maxi      : clamp range applied after requantization
+   * Each value becomes clamp(MultiplyByQuantizedMultiplier(in + bias) + zp). */
+  if (size == 0) {
+    return; /* a zero block width would divide by zero below */
+  }
+  for (int row = 0; row < plane; row++) {
+    for (int ch = 0; ch < oc; ch++) {
+      int block = ch / size;
+      int lane = ch % size;
+      int in_idx = block * in_plane_stride + row * size + lane;
+      int out_idx = row * out_oc_stride + ch;
+      int32_t acc = in[in_idx];
+      if (bias != NULL) {
+        acc += bias[ch];
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, multiplier, left_shift, right_shift) + zp;
+      acc = MSMAX(mini, MSMIN(maxi, acc));
+      out[out_idx] = (int8_t)acc;
+    }
+  }
+}
+
+void PostFuncInt8C8(const int *in, const int *bias, int8_t *out, int oc, int plane, int32_t multiplier,
+                    int32_t left_shift, int32_t right_shift, int32_t zp, int8_t mini, int8_t maxi) {
+  /* 8-channel-block variant: ((int32)in + bias) * multiplier + zp, clamped to [mini, maxi], written row-major
+   * with exactly `oc` values per output row. Input blocks are UP_ROUND(plane, C8NUM) rows of C8NUM values. */
+  const int block_stride = UP_ROUND(plane, C8NUM) * C8NUM;
+  PostConvFuncCommInt8(in, out, bias, oc, plane, oc, block_stride, multiplier, mini, maxi, left_shift, right_shift, zp,
+                       C8NUM);
+}
+
+void PostFuncInt8C4(const int *in, const int *bias, int8_t *out, int oc, int plane, int stride, int32_t multiplier,
+                    int32_t left_shift, int32_t right_shift, int32_t zp, int8_t mini, int8_t maxi) {
+  /* 4-channel-block variant: ((int32)in + bias) * multiplier + zp, clamped to [mini, maxi], written row-major
+   * with `stride` values between output rows. Input blocks are UP_ROUND(plane, C4NUM) rows of C4NUM values. */
+  const int block_stride = UP_ROUND(plane, C4NUM) * C4NUM;
+  PostConvFuncCommInt8(in, out, bias, oc, plane, stride, block_stride, multiplier, mini, maxi, left_shift,
+                       right_shift, zp, C4NUM);
+}
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/common_func.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/common_func.h
@@ -27,6 +27,11 @@
 extern "C" {
 #endif

+/* Requantizes an int32 buffer stored in blocks of 8 channels into row-major int8 with `oc` values per row:
+ * (in + bias) is scaled by multiplier/left_shift/right_shift, offset by zp and clamped to [mini, maxi]. */
+void PostFuncInt8C8(const int *in, const int *bias, int8_t *out, int oc, int plane, int32_t multiplier,
+                    int32_t left_shift, int32_t right_shift, int32_t zp, int8_t mini, int8_t maxi);
+/* Same conversion for blocks of 4 channels; `stride` is the number of elements between output rows. */
+void PostFuncInt8C4(const int *in, const int *bias, int8_t *out, int oc, int plane, int stride, int32_t multiplier,
+                    int32_t left_shift, int32_t right_shift, int32_t zp, int8_t mini, int8_t maxi);
+
 #ifdef ENABLE_ARM
 void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8,
                               size_t oc4, size_t offset);

--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/deconv.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/deconv.c
@@ -16,17 +16,10 @@

 #include "nnacl/int8/deconv.h"
 #include "nnacl/int8/matmul_int8.h"
-
-int DeConvInt8(const int8_t *input, const int8_t *weight, int32_t *output, size_t row8, size_t col8, size_t deep,
-               ConvParameter *conv_param) {
-  MatMulInt8(input, weight, output, row8, col8, deep, conv_param->conv_quant_arg_.input_quant_args_[0].zp_,
-             conv_param->conv_quant_arg_.filter_quant_args_[0].zp_);
-  return NNACL_OK;
-}
-
-int DeConvPostInt8(const int32_t *src, const int32_t *bias, int32_t *tmp, int8_t *out, int output_channel,
-                   ConvParameter *conv_param) {
-  /* row8x8-major(ih*iw x oc*kh*kw)  ->  row8x8-major(oh*ow x oc) */
+#include "nnacl/int8/common_func.h"
+int DeConvPostInt8C8(const int32_t *src, const int32_t *bias, int32_t *tmp, int8_t *out, int output_channel,
+                     ConvParameter *conv_param) {
+  /* row8x8-major(ih*iw x oc*kh*kw)  ->  row8-major(oh*ow x oc) */
  size_t input_plane = conv_param->input_w_ * conv_param->input_h_;
  size_t kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
  size_t output_plane = conv_param->output_w_ * conv_param->output_h_;
@@ -63,9 +56,161 @@ int DeConvPostInt8(const int32_t *src, const int32_t *bias, int32_t *tmp, int8_t
    }       /*ih*/
  }         /*oc8*/

-  PostFuncInt8(tmp, bias, out, output_channel, output_plane, UP_ROUND(output_plane, 8),
-               conv_param->conv_quant_arg_.quant_multiplier_[0], conv_param->conv_quant_arg_.left_shift_[0],
-               conv_param->conv_quant_arg_.right_shift_[0], conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
-               conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0]);
+  PostFuncInt8C8(tmp, bias, out, output_channel, output_plane, conv_param->conv_quant_arg_.quant_multiplier_[0],
+                 conv_param->conv_quant_arg_.left_shift_[0], conv_param->conv_quant_arg_.right_shift_[0],
+                 conv_param->conv_quant_arg_.output_quant_args_[0].zp_, conv_param->conv_quant_arg_.out_act_min_[0],
+                 conv_param->conv_quant_arg_.out_act_max_[0]);
  return NNACL_OK;
 }
+
+int DeConvPostInt8C4(const int32_t *src, const int32_t *bias, int32_t *tmp, int8_t *out, int output_channel,
+                     ConvParameter *conv_param) {
+  /* row4x4-major(ih*iw x oc*kh*kw)  ->  row4-major(oh*ow x oc)
+   *
+   * Scatter-accumulates the matmul result into the output plane, one block of C4NUM channels at a time, then
+   * requantizes the accumulated int32 values to int8 with PostFuncInt8C4.
+   *   src            : matmul result for `output_channel` channels
+   *   bias           : per-channel bias passed on to PostFuncInt8C4
+   *   tmp            : int32 scratch, output_plane * C4NUM values per channel block (zeroed here)
+   *   out            : int8 destination; rows are conv_param->output_channel_ elements apart
+   * Returns NNACL_OK. */
+  size_t input_plane = conv_param->input_w_ * conv_param->input_h_;
+  size_t kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
+  size_t output_plane = conv_param->output_w_ * conv_param->output_h_;
+  int oc4 = UP_DIV(output_channel, C4NUM);
+  int in_plane4 = UP_ROUND(input_plane, C4NUM);
+
+  /* Each kernel tap occupies in_plane4 (padded) rows of C4NUM values in src - the same padded row count the
+   * per-channel-block stride below is built from - so the tap strides must use in_plane4, not input_plane. */
+  int src_iw_stride = C4NUM;
+  int src_ih_stride = conv_param->input_w_ * C4NUM;
+  int src_kw_stride = in_plane4 * C4NUM;
+  int src_kh_stride = in_plane4 * conv_param->kernel_w_ * C4NUM;
+  int dst_oh_stride = conv_param->output_w_ * C4NUM;
+  int dst_ow_stride = C4NUM;
+  int dst_kh_stride = conv_param->dilation_h_ * conv_param->output_w_ * C4NUM;
+  int dst_kw_stride = conv_param->dilation_w_ * C4NUM;
+
+  for (int c = 0; c < oc4; c++) {
+    int32_t *dst_ptr = tmp + c * output_plane * C4NUM;
+    const int32_t *src_ptr = src + c * in_plane4 * kernel_plane * C4NUM;
+    memset(dst_ptr, 0, output_plane * C4NUM * sizeof(int32_t));
+
+    for (int ih = 0; ih < conv_param->input_h_; ih++) {
+      for (int iw = 0; iw < conv_param->input_w_; iw++) {
+        /* Top-left output position touched by this input pixel (may be negative because of padding). */
+        int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
+        int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
+
+        /* Restrict the kernel taps to those whose target falls inside the output plane. */
+        int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
+        int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
+        int kw_start = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_));
+        int kw_end = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_));
+        for (int kh = kh_start; kh < kh_end; kh++) {
+          for (int kw = kw_start; kw < kw_end; kw++) {
+            int src_index = ih * src_ih_stride + iw * src_iw_stride + kh * src_kh_stride + kw * src_kw_stride;
+            int dst_index = oh * dst_oh_stride + ow * dst_ow_stride + kh * dst_kh_stride + kw * dst_kw_stride;
+            int32_t *tmp_dst = dst_ptr + dst_index;
+            const int32_t *tmp_src = src_ptr + src_index;
+            /* tmp_dst[0..3] += tmp_src[0..3]; the ARM64 branch does the same with one 4-lane vector add. */
+#ifndef ENABLE_ARM64
+            for (int i = 0; i < C4NUM; i++) {
+              tmp_dst[i] += tmp_src[i];
+            }
+#else
+            asm volatile(
+              "mov x0, %[tmp_src] \n"
+              "mov x1, %[tmp_dst] \n"
+
+              "ld1 {v0.4s}, [x0] \n"
+              "ld1 {v1.4s}, [x1] \n"
+
+              "add v0.4s, v0.4s, v1.4s \n"
+
+              "st1 {v0.4s}, [x1] \n"
+
+              :
+              : [ tmp_src ] "r"(tmp_src), [ tmp_dst ] "r"(tmp_dst)
+              : "x0", "x1", "v0", "v1");
+#endif
+          } /*kw*/
+        }   /*kh*/
+      }     /*iw*/
+    }       /*ih*/
+  }         /*oc*/
+
+  PostFuncInt8C4(tmp, bias, out, output_channel, output_plane, conv_param->output_channel_,
+                 conv_param->conv_quant_arg_.quant_multiplier_[0], conv_param->conv_quant_arg_.left_shift_[0],
+                 conv_param->conv_quant_arg_.right_shift_[0], conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
+                 conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0]);
+  return NNACL_OK;
+}
+
+void DeConvWeightTransInt8(int8_t *src, int8_t *dst, int input_channel, int output_channel, int plane,
+                           bool support_optimize_) {
+  /* Repacks deconv weights for the optimized matmul.
+   *   src[ic][hw][oc]  ->  dst[hw][oc / 4][ic / 16][oc % 4][ic % 16]
+   * with oc padded to C4NUM and ic padded to C16NUM. Only real (unpadded) entries are written, so the caller
+   * decides what the padding holds. Nothing is done when support_optimize_ is false. */
+  if (!support_optimize_) {
+    /* todo normal int8 deconv */
+    return;
+  }
+
+  const int ic16 = UP_ROUND(input_channel, C16NUM);
+  const int oc4 = UP_ROUND(output_channel, C4NUM);
+  for (int ic = 0; ic < input_channel; ic++) {
+    const int dst_ic = (ic / C16NUM) * C16NUM * C4NUM + ic % C16NUM;
+    for (int oc = 0; oc < output_channel; oc++) {
+      const int dst_oc = (oc / C4NUM) * ic16 * C4NUM + (oc % C4NUM) * C16NUM;
+      for (int hw = 0; hw < plane; hw++) {
+        dst[hw * ic16 * oc4 + dst_oc + dst_ic] = src[ic * output_channel * plane + hw * output_channel + oc];
+      }
+    }
+  }
+}
+
+void DeConvPackWeightSum(int8_t *weight, int32_t *weight_sum, int32_t input_zp, int32_t filter_zp, int deep16, int col4,
+                         bool suppport_opt) {
+  /* For every packed weight column c (0 <= c < col4) stores
+   *   weight_sum[c] = filter_zp * input_zp * deep16 - input_zp * sum_r(weight[r][c])
+   * where the sum runs over all deep16 (padded) rows of the column.
+   *   weight : packed as [c / 4][r / 16][c % 4][r % 16]
+   * The block width is spelled C16NUM throughout instead of mixing it with a literal 16.
+   * Nothing is written when suppport_opt is false. */
+  if (suppport_opt) {
+    for (int c = 0; c < col4; c++) {
+      /* Offset of (row 0, column c); only the row-dependent part changes in the inner loop. */
+      const int col_base = (c / C4NUM) * deep16 * C4NUM + (c % C4NUM) * C16NUM;
+      int32_t value = 0;
+      for (int r = 0; r < deep16; r++) {
+        value += weight[col_base + (r / C16NUM) * C4NUM * C16NUM + r % C16NUM];
+      }
+      weight_sum[c] = filter_zp * input_zp * deep16 - value * input_zp;
+    }
+  } else {
+    /* todo normal int8 deconv */
+  }
+  return;
+}
+
+void DeConvPackInputSum(const int8_t *src, int32_t *dst, int32_t filter_zp, int row4, int col16, bool suppport_opt) {
+  /* For every packed input row r (0 <= r < row4) stores dst[r] = filter_zp * sum_c(src[r][c]) over all col16
+   * (padded) columns. src is packed as [r / 4][c / 16][r % 4][c % 16].
+   * Nothing is written when suppport_opt is false. */
+  if (!suppport_opt) {
+    /* todo normal int8 deconv */
+    return;
+  }
+
+  for (int r = 0; r < row4; r++) {
+    const int row_base = (r / C4NUM) * C4NUM * col16 + (r % C4NUM) * C16NUM;
+    int32_t row_sum = 0;
+    for (int c = 0; c < col16; c++) {
+      row_sum += src[row_base + (c / C16NUM) * C16NUM * C4NUM + c % C16NUM];
+    }
+    dst[r] = row_sum * filter_zp;
+  }
+}
+
+int DeConvInt8(const int8_t *input, const int8_t *weight, int32_t *output, int32_t *weight_sum, int32_t *input_sum,
+               size_t act_row, size_t act_col, size_t act_deep, ConvParameter *conv_param,
+               MATMUL_OPT_R4_FUNC matmul_func) {
+  /* Runs the supplied matmul routine on the packed operands:
+   *   matmul_func(output, input, weight, weight_sum, input_sum, act_row, act_col, act_deep)
+   * conv_param is currently unused. Always returns NNACL_OK, including when matmul_func is NULL (no work done). */
+  if (matmul_func == NULL) {
+    /* todo normal int8 deconv */
+    return NNACL_OK;
+  }
+  matmul_func(output, input, weight, weight_sum, input_sum, act_row, act_col, act_deep);
+  return NNACL_OK;
+}
+
+int DeConvPostInt8(const int32_t *src, const int32_t *bias, int32_t *tmp, int8_t *out, int output_channel,
+                   ConvParameter *conv_param, bool support_optimize) {
+  /* Dispatches the post-processing step: the optimized path forwards to DeConvPostInt8C4 and returns its
+   * result; the non-optimized path is not implemented yet and reports NNACL_OK without touching `out`. */
+  if (!support_optimize) {
+    /* todo normal int8 deconv post */
+    return NNACL_OK;
+  }
+  return DeConvPostInt8C4(src, bias, tmp, out, output_channel, conv_param);
+}
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/deconv.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/deconv.h
@@ -22,16 +22,22 @@
 #include "nnacl/errorcode.h"
 #include "nnacl/conv_parameter.h"
 #include "nnacl/common_func.h"
+#include "nnacl/int8/matmul_int8.h"

 #ifdef __cplusplus
 extern "C" {
 #endif
+/* Per packed weight column: weight_sum[c] = filter_zp * input_zp * deep16 - input_zp * sum(column c). */
+void DeConvPackWeightSum(int8_t *weight, int32_t *weight_sum, int32_t input_zp, int32_t filter_zp, int deep16, int col4,
+                         bool suppport_opt);
+/* Per packed input row: dst[r] = filter_zp * sum(row r). */
+void DeConvPackInputSum(const int8_t *src, int32_t *dst, int32_t filter_zp, int row4, int col16, bool suppport_opt);
+/* Repacks the deconv weights into the C4NUM x C16NUM blocked layout read by the optimized matmul. */
+void DeConvWeightTransInt8(int8_t *src, int8_t *dst, int input_channel, int output_channel, int plane,
+                           bool support_optimize_);

-int DeConvInt8(const int8_t *input, const int8_t *weight, int32_t *output, size_t row8, size_t col8, size_t deep,
-               ConvParameter *conv_param);
-
+/* Calls matmul_func on the packed operands; does nothing when matmul_func is NULL. Returns NNACL_OK. */
+int DeConvInt8(const int8_t *input, const int8_t *weight, int32_t *output, int32_t *weight_sum, int32_t *input_sum,
+               size_t act_row, size_t act_col, size_t act_deep, ConvParameter *conv_param,
+               MATMUL_OPT_R4_FUNC matmul_func);
+/* Accumulates the matmul result into the output plane and requantizes it to int8 (optimized path only). */
 int DeConvPostInt8(const int32_t *src, const int32_t *bias, int32_t *tmp, int8_t *out, int output_channel,
-                   ConvParameter *conv_param);
+                   ConvParameter *conv_param, bool support_optimize);
 #ifdef __cplusplus
 }
 #endif

--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/matmul_int8.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/matmul_int8.c
@@ -17,7 +17,6 @@
 #include "nnacl/int8/matmul_int8.h"
 #include <limits.h>
 #include "nnacl/quantization/fixed_point.h"
-
 void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
  for (int r = 0; r < row; r++) {
    int8_t *src = src_ptr + r * col;
@@ -29,6 +28,23 @@ void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col)
  }
 }

+void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col) {
+  /* Row-major to row16x4-major (block row-major).
+   *   src[r][c]  ->  dst[r / 4][c / 16][r % 4][c % 16], with columns padded to C16NUM.
+   * Both buffers are treated as int8_t; only real (unpadded) elements are written. */
+  const int8_t *src = (const int8_t *)src_ptr;
+  int8_t *dst = (int8_t *)dst_ptr;
+  const int col16 = UP_ROUND(col, C16NUM);
+
+  for (int r = 0; r < row; r++) {
+    const int src_base = r * col;
+    const int dst_base = (r / C4NUM) * C4NUM * col16 + (r % C4NUM) * C16NUM;
+    for (int c = 0; c < col; c++) {
+      dst[dst_base + (c / C16NUM) * C16NUM * C4NUM + c % C16NUM] = src[src_base + c];
+    }
+  }
+}
+
 void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
  for (int r = 0; r < row; r++) {
    int rd8 = r / 8;
@@ -57,3 +73,26 @@ void MatMulInt8(const int8_t *a, const int8_t *b, int32_t *c, const int row8, co
    }
  }
 }
+
+void MatMulOptR4Int8(int32_t *dst, const int8_t *a, const int8_t *b, const int32_t *bias, const int32_t *input_sum,
+                     size_t row_4, size_t col_4, size_t deep_16) {
+  /* row4x16-major * row16x4-major => row4x4-major.
+   * Each output is sum over d of a * b, minus input_sum[r], plus bias[c].
+   * Counters are size_t so comparisons against the size_t extents never mix signedness. */
+  for (size_t r = 0; r < row_4; r++) {
+    const size_t a_base = (r / C4NUM) * deep_16 * C4NUM + (r % C4NUM) * C16NUM; /* row r within its 4-row band */
+    for (size_t c = 0; c < col_4; c++) {
+      const size_t b_base = (c / C4NUM) * deep_16 * C4NUM + (c % C4NUM) * C16NUM; /* col c within its band */
+      const size_t ci = (c / C4NUM) * row_4 * C4NUM + r * C4NUM + c % C4NUM; /* [c / C4NUM][r][c % C4NUM] */
+      int32_t value = 0;
+      for (size_t d = 0; d < deep_16; d++) {
+        /* a and b share the depth layout: blocks of C16NUM consecutive depth values per row/col */
+        const size_t depth = (d / C16NUM) * C4NUM * C16NUM + d % C16NUM;
+        value += a[a_base + depth] * b[b_base + depth];
+      }
+      value -= input_sum[r];
+      value += bias[c];
+      dst[ci] = value;
+    }
+  }
+}
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/matmul_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/matmul_int8.h
@@ -18,14 +18,19 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_INT8_MATMUL_H_

 #include "nnacl/op_base.h"
+#include "nnacl/matmul_parameter.h"

 #ifdef __cplusplus
 extern "C" {
 #endif
 void MatMulInt8(const int8_t *a, const int8_t *b, int32_t *c, const int row8, const int col8, const int deep,
                const int32_t a_zp, const int32_t b_zp);
+void MatMulOptR4Int8(int32_t *dst, const int8_t *a, const int8_t *b, const int32_t *bias, const int32_t *input_sum,
+                     size_t row_4, size_t col_4, size_t deep_16);
+
 void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
+void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col);
 #ifdef __cplusplus
 }
 #endif

--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/matmul_parameter.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/matmul_parameter.h
@@ -19,6 +19,11 @@

 #include "nnacl/op_base.h"

+typedef void (*MATMUL_OPT_R4_FUNC)(int32_t *dst, const int8_t *a, const int8_t *b, const int32_t *bias,
+                                   const int32_t *input_sum, size_t row_4, size_t col_4, size_t deep_16);
+
+typedef void (*MAT_TRANS_FUNC)(void *dst, void *a, int row, int col);
+
 typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType;

 typedef struct MatMulParameter {

--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc
@@ -36,15 +36,6 @@ class TestDeconvInt8 : public mindspore::CommonTest {
  TestDeconvInt8() {}
 };

-void FloatToInt8(float *fptr, int8_t *iptr, size_t size, int32_t zp, double scale) {
-  for (int i = 0; i < size; i++) {
-    int32_t value = round(fptr[i] / scale + zp);
-    value = MSMIN(value, INT8_MAX);
-    value = MSMAX(value, INT8_MIN);
-    iptr[i] = (int8_t)value;
-  }
-}
-
 TEST_F(TestDeconvInt8, PackWeight1) {
  int8_t in[] = {-8, 11,   99,  -80, 8,    -12, 37,  -45, 31,   -69, -66, 26,  112, 124, -109, 85,  -24, 28,  -46, 100,
                 72, -36,  -82, 64,  -110, 37,  -72, 65,  -124, 91,  -43, 99,  3,   100, 19,   51,  -14, -81, 67,  90,
@@ -164,6 +155,125 @@ TEST_F(TestDeconvInt8, MatMulTest1) {
  CompareOutputData(out_row_major, co_row_major_10_18, 180, 1);
 }

+TEST_F(TestDeconvInt8, MatMulOptTest1) {  // packs input/weight, builds both sums, then checks MatMulOptR4Int8
+  int8_t a_src_ptr[] = {-6, 76,  32,  80,  -73, 8,    -85, -3,  114, 80,  30,  42,  -41, 117, 62,  -76, -77, -111,
+                        88, 105, 68,  105, -74, 13,   51,  94,  31,  -52, -92, -4,  -35, -71, 101, -93, 46,  -65,
+                        57, -41, -51, 77,  1,   9,    73,  -19, -36, 57,  81,  -24, 40,  103, 112, 109, -41, -68,
+                        57, 61,  55,  -20, 3,   2,    17,  -16, -31, 58,  -4,  67,  -4,  -95, -5,  -72, 81,  15,
+                        -7, -16, -47, 112, 114, -26,  -98, 53,  15,  -49, 26,  19,  19,  8,   -57, -35, -79, 118,
+                        29, 21,  37,  -48, 83,  7,    124, 113, -5,  15,  -8,  107, -65, -88, 50,  -47, -80, -84,
+                        3,  -45, 92,  42,  -20, -101, 106, -10, 89,  67,  55,  10};
+  int32_t input_zp = 15;
+  int8_t b_src_ptr[] = {
+    92,  27,   22,   52,  -112, -20, -57, -2,   89,   32,  93,   -66,  -25, -54, 94,  -97, -119, -98,  101,  -99,
+    77,  -83,  76,   95,  59,   97,  8,   40,   -109, -20, 67,   -107, 37,  -6,  -54, -20, -30,  36,   -106, -103,
+    -3,  -86,  -82,  59,  4,    -75, -50, -106, 55,   104, -117, -71,  -20, -85, -77, 16,  -25,  -58,  4,    80,
+    -75, 94,   32,   -68, 2,    40,  56,  -103, 11,   -98, -70,  -69,  0,   57,  -6,  82,  66,   -112, -61,  33,
+    -77, -53,  95,   -38, 87,   -46, -3,  81,   -47,  43,  21,   26,   -45, -57, 50,  -24, -82,  -114, 61,   46,
+    -53, 78,   -24,  31,  -7,   37,  29,  38,   45,   106, 52,   -42,  31,  -6,  -61, -87, 2,    79,   -5,   -42,
+    43,  -106, -104, 7,   91,   -63, 58,  97,   -15,  74,  -96,  15,   -23, -3,  -47, -97, 100,  -54,  26,   -46,
+    35,  26,   100,  -80, 34,   -25, 96,  -67,  -80,  -27, 66,   41,   41,  -43, -43, -38, -4,   -64,  31,   7,
+    -8,  6,    -2,   39,  -119, 53,  75,  -91,  -44,  77,  -62,  22,   -44, 78,  -67, -48, -115, -4,   43,   81,
+    40,  -20,  -5,   -89, 60,   -62, -4,  -48,  66,   -64, -69,  62,   17,  -89, 1,   87,  81,   32,   -29,  51,
+    40,  27,   66,   67,  11,   -69, 85,  -79,  -106, 55,  22,   -23,  62,  69,  -74, 49};
+  int32_t filter_zp = -20;
+
+  /* ----------------------   pack input  -------------------------
+   * a_src_ptr is packed as 10 rows x 12 cols into a 12 x 16 buffer pre-filled with input_zp (the padding). */
+  int8_t packed_a[12 * 16] = {0};
+  memset(packed_a, static_cast<int8_t>(input_zp), 12 * 16);
+  int8_t correct_packed_a[] = {
+    -6,  76,  32,  80,  -73, 8,   -85, -3,  114, 80,  30,  42,  15,  15,  15,  15,  -41, 117,  62,  -76, -77, -111,
+    88,  105, 68,  105, -74, 13,  15,  15,  15,  15,  51,  94,  31,  -52, -92, -4,  -35, -71,  101, -93, 46,  -65,
+    15,  15,  15,  15,  57,  -41, -51, 77,  1,   9,   73,  -19, -36, 57,  81,  -24, 15,  15,   15,  15,  40,  103,
+    112, 109, -41, -68, 57,  61,  55,  -20, 3,   2,   15,  15,  15,  15,  17,  -16, -31, 58,   -4,  67,  -4,  -95,
+    -5,  -72, 81,  15,  15,  15,  15,  15,  -7,  -16, -47, 112, 114, -26, -98, 53,  15,  -49,  26,  19,  15,  15,
+    15,  15,  19,  8,   -57, -35, -79, 118, 29,  21,  37,  -48, 83,  7,   15,  15,  15,  15,   124, 113, -5,  15,
+    -8,  107, -65, -88, 50,  -47, -80, -84, 15,  15,  15,  15,  3,   -45, 92,  42,  -20, -101, 106, -10, 89,  67,
+    55,  10,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,   15,  15,  15,  15,
+    15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,
+  };
+  RowMajor2Row16x4MajorInt8(a_src_ptr, packed_a, 10, 12);
+  CompareOutputData(packed_a, correct_packed_a, 16 * 12, 0);
+
+  /* ----------------------   pack weight  -------------------------
+   * packed_b (16 * 3 * 8 bytes) is pre-filled with filter_zp so untouched padding holds the zero point. */
+  int8_t packed_b[16 * 3 * 8] = {0};
+  memset(packed_b, static_cast<int8_t>(filter_zp), 16 * 3 * 8);
+  int8_t correct_packed_b[] = {
+    92,   101,  -30,  -77,  0,    21,   45,  58,  34,   -2,  40,  -29, -20, -20,  -20,  -20, 27,  -99,  36,  16,  57,
+    26,   106,  97,   -25,  39,   -20,  51,  -20, -20,  -20, -20, 22,  77,  -106, -25,  -6,  -45, 52,   -15, 96,  -119,
+    -5,   40,   -20,  -20,  -20,  -20,  52,  -83, -103, -58, 82,  -57, -42, 74,   -67,  53,  -89, 27,   -20, -20, -20,
+    -20,  -112, 76,   -3,   4,    66,   50,  31,  -96,  -80, 75,  60,  66,  -20,  -20,  -20, -20, -20,  95,  -86, 80,
+    -112, -24,  -6,   15,   -27,  -91,  -62, 67,  -20,  -20, -20, -20, -20, -20,  -20,  -20, -20, -20,  -20, -20, -20,
+    -20,  -20,  -20,  -20,  -20,  -20,  -20, -20, -20,  -20, -20, -20, -20, -20,  -20,  -20, -20, -20,  -20, -20, -20,
+    -20,  -20,  -57,  59,   -82,  -75,  -61, -82, -61,  -23, 66,  -44, -4,  11,   -20,  -20, -20, -20,  -2,  97,  59,
+    94,   33,   -114, -87,  -3,   41,   77,  -48, -69,  -20, -20, -20, -20, 89,   8,    4,   32,  -77,  61,  2,   -47,
+    41,   -62,  66,   85,   -20,  -20,  -20, -20, 32,   40,  -75, -68, -53, 46,   79,   -97, -43, 22,   -64, -79, -20,
+    -20,  -20,  -20,  93,   -109, -50,  2,   95,  -53,  -5,  100, -43, -44, -69,  -106, -20, -20, -20,  -20, -66, -20,
+    -106, 40,   -38,  78,   -42,  -54,  -38, 78,  62,   55,  -20, -20, -20, -20,  -20,  -20, -20, -20,  -20, -20, -20,
+    -20,  -20,  -20,  -20,  -20,  -20,  -20, -20, -20,  -20, -20, -20, -20, -20,  -20,  -20, -20, -20,  -20, -20, -20,
+    -20,  -20,  -20,  -20,  -25,  67,   55,  56,  87,   -24, 43,  26,  -4,  -67,  17,   22,  -20, -20,  -20, -20, -54,
+    -107, 104,  -103, -46,  31,   -106, -46, -64, -48,  -89, -23, -20, -20, -20,  -20,  94,  37,  -117, 11,  -3,  -7,
+    -104, 35,   31,   -115, 1,    62,   -20, -20, -20,  -20, -97, -6,  -71, -98,  81,   37,  7,   26,   7,   -4,  87,
+    69,   -20,  -20,  -20,  -20,  -119, -54, -20, -70,  -47, 29,  91,  100, -8,   43,   81,  -74, -20,  -20, -20, -20,
+    -98,  -20,  -85,  -69,  43,   38,   -63, -80, 6,    81,  32,  49,  -20, -20,  -20,  -20, -20, -20,  -20, -20, -20,
+    -20,  -20,  -20,  -20,  -20,  -20,  -20, -20, -20,  -20, -20, -20, -20, -20,  -20,  -20, -20, -20,  -20, -20, -20,
+    -20,  -20,  -20,  -20,  -20,  -20};
+  DeConvWeightTransInt8(b_src_ptr, packed_b, 12, 6, 3, true);
+  /* kernel : 12x1x3x6   nhwc   */
+  CompareOutputData(packed_b, correct_packed_b, 16 * 3 * 8, 0);
+
+  /* ----------------------   calculate input_sum   -------------------------
+   * Expected values match filter_zp * (sum of a packed row), e.g. padding rows: 16 * 15 * -20 = -4800. */
+  int32_t input_sum[12] = {0};
+  int32_t correct_input_sum[] = {-7100, -4780, 580, -4880, -9460, -1420, -3120, -3260, -1840, -6960, -4800, -4800};
+  DeConvPackInputSum(packed_a, input_sum, filter_zp, 12, 16, true);
+  CompareOutputData(input_sum, correct_input_sum, 12, 0);
+
+  for (int i = 0; i < 12; i++) {  // debug aid: print index, actual and expected for each mismatch
+    if (input_sum[i] != correct_input_sum[i]) {
+      printf("%d %d %d\n", i, input_sum[i], correct_input_sum[i]);
+    }
+  }
+
+  /* ----------------------   calculate weight_sum   -------------------------
+   * 24 entries (3 * 8); in the expected data the last two of every 8 are 0 - presumably the padded channels. */
+  int32_t weight_sum[3 * 8] = {0};
+  int32_t correct_weight_sum[] = {-7395, -8265, -3090, -435, -5655, -1035, 0,     0,     1695,  -4770, -6630, 300,
+                                  -765,  -2835, 0,     0,    -7395, 4665,  -2475, -4170, -2880, -1110, 0,     0};
+  DeConvPackWeightSum(packed_b, weight_sum, input_zp, filter_zp, 16, 24, true);
+  CompareOutputData(weight_sum, correct_weight_sum, 3 * 8, 0);
+
+  /* ----------------------   do matmul   -------------------------
+   * weight_sum goes in the bias argument; the trailing sizes are row_4 = 12, col_4 = 24, deep_16 = 16. */
+  int32_t tmp_output[12 * 24] = {0};
+  int32_t correct_tmp_output[] = {
+    -1624,  -19061, 1795,   -17119, 14706,  417,    7306,   1357,   9653,   -44022, 19414,  -36187, -2041,  6874,
+    -5766,  3072,   9842,   2395,   12464,  -18826, -12267, -17853, 4617,   -19468, -15734, -6112,  2122,   14259,
+    11098,  -9520,  12407,  -15239, 10309,  -34271, 9740,   -14607, -5027,  12313,  -508,   -10808, 0,      0,
+    0,      0,      0,      0,      0,      0,      1604,   14898,  0,      0,      -8212,  9471,   0,      0,
+    -23430, 6343,   0,      0,      4020,   -3740,  0,      0,      -9730,  22378,  0,      0,      4702,   4740,
+    0,      0,      -7541,  5461,   0,      0,      -6633,  8356,   0,      0,      -16854, 9147,   0,      0,
+    -4018,  -11524, 0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      17194,  28501,
+    13376,  -9359,  21454,  22425,  -21049, 6603,   23479,  -658,   12866,  9739,   -12173, -7558,  3862,   10238,
+    4110,   31945,  10069,  -7376,  -1948,  -20322, 16439,  3260,   1712,   12743,  -8132,  -27744, 7633,   -33916,
+    18755,  11300,  3686,   9222,   10103,  26102,  17,     13135,  785,    -6305,  0,      0,      0,      0,
+    0,      0,      0,      0,      -27325, 14957,  0,      0,      -12191, -21866, 0,      0,      -21690, -18554,
+    0,      0,      8737,   14529,  0,      0,      -1774,  -19575, 0,      0,      -12761, 13286,  0,      0,
+    20523,  2488,   0,      0,      -12782, 12688,  0,      0,      -1194,  -10523, 0,      0,      -4044,  -9671,
+    0,      0,      0,      0,      0,      0,      0,      0,      0,      0,      -4671,  -4173,  8675,   -8560,
+    -1597,  -4946,  -20214, -6752,  -11439, 5138,   11119,  -17661, -6690,  -17301, -5541,  -4356,  22347,  -11778,
+    2389,   -22030, -5176,  -242,   8786,   -994,   9104,   -7208,  24117,  3724,   -13648, -1840,  12265,  10347,
+    -10325, 7184,   19374,  -29001, 3979,   -6704,  -23278, -8124,  0,      0,      0,      0,      0,      0,
+    0,      0,      -9132,  8560,   0,      0,      19264,  -10169, 0,      0,      -15133, -13678, 0,      0,
+    7894,   -51,    0,      0,      -4775,  -29785, 0,      0,      -12597, 4088,   0,      0,      -17420, 1815,
+    0,      0,      15796,  3101,   0,      0,      -37969, -10818, 0,      0,      12714,  -7827,  0,      0,
+    0,      0,      0,      0,      0,      0,      0,      0};
+  MatMulOptR4Int8(tmp_output, packed_a, packed_b, weight_sum, input_sum, 12, 24, 16);
+  CompareOutputData(tmp_output, correct_tmp_output, 12 * 3 * 8, 0);
+}
+
 TEST_F(TestDeconvInt8, PostAddTest1) {
  int32_t in[] = {
    -4956,  -3923,  868,   -8880, -4089, -5179, -4526, -4527, -10464, 99,    -5826, -2995, -4519, -4519, -10509, -2505,
@@ -185,17 +295,17 @@ TEST_F(TestDeconvInt8, PostAddTest1) {
  int32_t right_shift;
  QuantizeRoundParameter(multiplier, &quant_multiplier, &left_shift, &right_shift);
  int32_t zp = 83;
-  PostFuncInt8(in, bias, out, 10, 5, 8, quant_multiplier, left_shift, right_shift, zp, -128, 127);
+  PostFuncInt8C8(in, bias, out, 10, 5, quant_multiplier, left_shift, right_shift, zp, -128, 127);
  CompareOutputData(out, co, 50, 1);

  int8_t co_relu[] = {0, 11, 99, 0, 8, 0, 0, 0,  112, 124, 0,   85, 0,   28, 0, 0,  0, 37, 0, 65, 0,  91, 0, 0, 0,
                      0, 67, 90, 4, 0, 0, 0, 47, 0,   114, 125, 0,  100, 0,  0, 37, 0, 31, 0, 0,  26, 0,  0, 0, 100};
-  PostFuncInt8(in, bias, out, 10, 5, 8, quant_multiplier, left_shift, right_shift, zp, 0, 127);
+  PostFuncInt8C8(in, bias, out, 10, 5, quant_multiplier, left_shift, right_shift, zp, 0, 127);
  CompareOutputData(out, co_relu, 50, 1);

  int8_t co_relu6[] = {0, 6, 6, 0, 6, 0, 0, 0, 6, 6, 0, 6, 0, 6, 0, 0, 0, 6, 0, 6, 0, 6, 0, 0, 0,
                       0, 6, 6, 4, 0, 0, 0, 6, 0, 6, 6, 0, 6, 0, 0, 6, 0, 6, 0, 0, 6, 0, 0, 0, 6};
-  PostFuncInt8(in, bias, out, 10, 5, 8, quant_multiplier, left_shift, right_shift, zp, 0, 6);
+  PostFuncInt8C8(in, bias, out, 10, 5, quant_multiplier, left_shift, right_shift, zp, 0, 6);
  CompareOutputData(out, co_relu6, 50, 1);
 }

@@ -247,7 +357,7 @@ TEST_F(TestDeconvInt8, DeConvInt8Test1) {
  std::vector<lite::tensor::Tensor *> outputs_;
  auto deconv_param = new ConvParameter();
  lite::Context *ctx = new lite::Context;
-  ctx->thread_num_ = 2;
+  ctx->thread_num_ = 1;
  int8_t *correct;
  int total_size = DeConvInt8TestInit1(&inputs_, &outputs_, deconv_param, &correct);
  mindspore::kernel::DeConvInt8CPUKernel *deconv = new mindspore::kernel::DeConvInt8CPUKernel(