Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
8beb1b0f
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8beb1b0f
编写于
8月 03, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
8月 03, 2020
浏览文件
操作
浏览文件
下载
差异文件
!3799 conv1x1 & deconv change
Merge pull request !3799 from ling/conv1x1
上级
8ff7c0b6
fa86096f
变更
31
展开全部
隐藏空白更改
内联
并排
Showing
31 changed file
with
2261 addition
and
300 deletion
+2261
-300
mindspore/lite/src/populate_parameter.cc
mindspore/lite/src/populate_parameter.cc
+2
-4
mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
...pore/lite/src/runtime/kernel/arm/base/convolution_base.cc
+4
-22
mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h
...spore/lite/src/runtime/kernel/arm/base/convolution_base.h
+1
-2
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
...spore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
+68
-99
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.h
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.h
+11
-12
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc
+72
-84
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.h
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.h
+13
-10
mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc
mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc
+2
-2
mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc
...e/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc
+3
-3
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.cc
...re/lite/src/runtime/kernel/arm/opclib/fp32/common_func.cc
+19
-8
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h
...ore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h
+4
-3
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/deconv.cc
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/deconv.cc
+48
-5
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/deconv.h
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/deconv.h
+4
-3
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/matmul.cc
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/matmul.cc
+14
-7
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/matmul.h
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/matmul.h
+2
-2
mindspore/lite/src/runtime/kernel/arm/opclib/int8/deconv.cc
mindspore/lite/src/runtime/kernel/arm/opclib/int8/deconv.cc
+8
-5
mindspore/lite/src/runtime/kernel/arm/opclib/matmul.h
mindspore/lite/src/runtime/kernel/arm/opclib/matmul.h
+3
-3
mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc
mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc
+28
-15
mindspore/lite/src/runtime/kernel/arm/opclib/pack.h
mindspore/lite/src/runtime/kernel/arm/opclib/pack.h
+2
-0
mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h
...ite/src/runtime/kernel/arm/opclib/quantization/quantize.h
+17
-8
mindspore/lite/test/run_test.sh
mindspore/lite/test/run_test.sh
+10
-0
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc
...test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc
+395
-0
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc
...t/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc
+548
-0
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/fullconnection_fp32_tests.cc
.../src/runtime/kernel/arm/fp32/fullconnection_fp32_tests.cc
+145
-0
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/strassen_fp32_tests.cc
...est/ut/src/runtime/kernel/arm/fp32/strassen_fp32_tests.cc
+369
-0
mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc
.../test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc
+266
-0
mindspore/lite/test/ut/src/runtime/kernel/arm/int8/fullconnection_int8_tests.cc
.../src/runtime/kernel/arm/int8/fullconnection_int8_tests.cc
+2
-3
mindspore/lite/test/ut/src/runtime/kernel/arm/int8/pad_int8_tests.cc
...ite/test/ut/src/runtime/kernel/arm/int8/pad_int8_tests.cc
+201
-0
mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/conv1x1fp32_output1_nhwc.bin
...me/kernel/arm/test_data/conv/conv1x1fp32_output1_nhwc.bin
+0
-0
mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/deconv/deconv_fp32_nchw_output1.bin
.../kernel/arm/test_data/deconv/deconv_fp32_nchw_output1.bin
+0
-0
mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/deconv/deconv_fp32_nhwc_input1.bin
...e/kernel/arm/test_data/deconv/deconv_fp32_nhwc_input1.bin
+0
-0
未找到文件。
mindspore/lite/src/populate_parameter.cc
浏览文件 @
8beb1b0f
...
...
@@ -165,8 +165,7 @@ OpParameter *PopulateFullconnectionParameter(const lite::Primitive *primitive) {
matmul_param
->
b_transpose_
=
true
;
matmul_param
->
a_transpose_
=
false
;
matmul_param
->
has_bias_
=
param
->
hasBias
();
matmul_param
->
minf_
=
-
FLT_MAX
;
matmul_param
->
maxf_
=
FLT_MAX
;
matmul_param
->
act_type_
=
ActType_No
;
return
reinterpret_cast
<
OpParameter
*>
(
matmul_param
);
}
...
...
@@ -181,8 +180,7 @@ OpParameter *PopulateMatMulParameter(const lite::Primitive *primitive) {
matmul_param
->
b_transpose_
=
param
->
transposeB
();
matmul_param
->
a_transpose_
=
param
->
transposeA
();
matmul_param
->
has_bias_
=
false
;
matmul_param
->
minf_
=
-
FLT_MAX
;
matmul_param
->
maxf_
=
FLT_MAX
;
matmul_param
->
act_type_
=
ActType_No
;
return
reinterpret_cast
<
OpParameter
*>
(
matmul_param
);
}
...
...
mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
浏览文件 @
8beb1b0f
...
...
@@ -146,28 +146,10 @@ int ConvolutionBaseCPUKernel::SetQuantParam() {
QuantizeRoundParameter
(
real_multiplier
,
&
conv_quant_arg_
->
quant_multiplier_
[
0
],
&
conv_quant_arg_
->
left_shift_
[
0
],
&
conv_quant_arg_
->
right_shift_
[
0
]);
ComputeQuantOutRange
(
conv_param_
);
CalculateActivationRangeQuantized
(
conv_param_
->
is_relu_
,
conv_param_
->
is_relu6_
,
conv_param_
->
conv_quant_arg_
.
quant_args_
[
2
][
0
].
zp_
,
conv_param_
->
conv_quant_arg_
.
quant_args_
[
2
][
0
].
scale_
,
&
conv_param_
->
conv_quant_arg_
.
out_act_min_
[
0
],
&
conv_param_
->
conv_quant_arg_
.
out_act_max_
[
0
]);
return
RET_OK
;
}
void
ComputeQuantOutRange
(
ConvParameter
*
conv_param
)
{
int32_t
min
=
std
::
numeric_limits
<
int8_t
>::
min
();
int32_t
max
=
std
::
numeric_limits
<
int8_t
>::
max
();
float
scale
=
conv_param
->
conv_quant_arg_
.
quant_args_
[
2
][
0
].
scale_
;
int32_t
zp
=
conv_param
->
conv_quant_arg_
.
quant_args_
[
2
][
0
].
zp_
;
bool
is_relu
=
conv_param
->
is_relu_
;
bool
is_relu6
=
conv_param
->
is_relu6_
;
int32_t
quantized_zero
=
QuantizeToInt8
(
0
,
scale
,
zp
);
int32_t
quantized_six
=
QuantizeToInt8
(
6
,
scale
,
zp
);
if
(
is_relu
)
{
min
=
min
>
quantized_zero
?
min
:
quantized_zero
;
}
else
if
(
is_relu6
)
{
min
=
min
>
quantized_zero
?
min
:
quantized_zero
;
max
=
max
<
quantized_six
?
max
:
quantized_six
;
}
else
{
// do nothing
}
conv_param
->
conv_quant_arg_
.
out_act_min_
[
0
]
=
min
;
conv_param
->
conv_quant_arg_
.
out_act_max_
[
0
]
=
max
;
}
}
// namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h
浏览文件 @
8beb1b0f
...
...
@@ -38,7 +38,7 @@ class ConvolutionBaseCPUKernel : public LiteKernel {
public:
ConvolutionBaseCPUKernel
(
OpParameter
*
parameter
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
inputs
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
outputs
,
const
Context
*
ctx
)
:
LiteKernel
(
parameter
,
inputs
,
outputs
),
ctx_
(
ctx
),
thread_count_
(
ctx
->
threadNum
)
{
:
LiteKernel
(
parameter
,
inputs
,
outputs
),
ctx_
(
ctx
),
thread_count_
(
ctx
->
threadNum
)
{
opParameter
->
thread_num_
=
ctx
->
threadNum
;
conv_param_
=
reinterpret_cast
<
ConvParameter
*>
(
opParameter
);
}
...
...
@@ -60,7 +60,6 @@ class ConvolutionBaseCPUKernel : public LiteKernel {
ConvParameter
*
conv_param_
;
LayoutConvertor
convert_func_
;
};
void
ComputeQuantOutRange
(
ConvParameter
*
conv_param
);
bool
CheckSupportFP16
();
}
// namespace mindspore::kernel
...
...
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
浏览文件 @
8beb1b0f
...
...
@@ -23,62 +23,71 @@ using mindspore::lite::RET_OK;
namespace
mindspore
::
kernel
{
Convolution1x1CPUKernel
::~
Convolution1x1CPUKernel
()
{
if
(
c4_output
_
!=
nullptr
)
{
free
(
c4_output
_
);
c4_output
_
=
nullptr
;
if
(
weight_ptr
_
!=
nullptr
)
{
free
(
weight_ptr
_
);
weight_ptr
_
=
nullptr
;
}
if
(
c4
_input_
!=
nullptr
)
{
free
(
c4
_input_
);
c4
_input_
=
nullptr
;
if
(
pack
_input_
!=
nullptr
)
{
free
(
pack
_input_
);
pack
_input_
=
nullptr
;
}
if
(
pre_trans_input_
)
{
if
(
pack_output_
!=
nullptr
)
{
free
(
pack_output_
);
pack_output_
=
nullptr
;
}
if
(
pre_trans_input_
&&
input_ptr_
!=
nullptr
)
{
free
(
input_ptr_
);
input_ptr_
=
nullptr
;
}
if
(
tmp_ptr_
!=
nullptr
)
{
free
(
tmp_ptr_
);
tmp_ptr_
=
nullptr
;
}
if
(
weight_ptr_
!=
nullptr
)
{
free
(
weight_ptr_
);
weight_ptr_
=
nullptr
;
}
delete
matmul_param_
;
}
int
Convolution1x1CPUKernel
::
ReSize
()
{
return
RET_OK
;
}
int
Convolution1x1CPUKernel
::
ReSize
()
{
if
(
pack_input_
!=
nullptr
)
{
free
(
pack_input_
);
pack_input_
=
nullptr
;
}
if
(
pre_trans_input_
&&
input_ptr_
!=
nullptr
)
{
free
(
input_ptr_
);
input_ptr_
=
nullptr
;
}
InitConv1x1MatmulParam
();
InitConv1x1Param
();
return
RET_OK
;
}
void
Convolution1x1CPUKernel
::
InitConv1x1MatmulParam
()
{
matmul_param_
=
new
StrassenMatMulParameter
();
matmul_param_
->
row_
=
conv_param_
->
output_h_
*
conv_param_
->
output_w_
;
matmul_param_
->
col_
=
UP_DIV
(
conv_param_
->
output_channel_
,
FP32_STRASSEN_UINT
);
matmul_param_
->
deep_
=
UP_DIV
(
conv_param_
->
input_channel_
,
FP32_STRASSEN_UINT
);
matmul_param_
->
a_stride_
=
matmul_param_
->
row_
*
FP32_STRASSEN_UINT
;
matmul_param_
->
b_stride_
=
matmul_param_
->
deep_
*
FP32_STRASSEN_WEIGHT_UINT
;
matmul_param_
->
c_stride_
=
matmul_param_
->
row_
*
FP32_STRASSEN_UINT
;
matmul_param_
->
col_
=
conv_param_
->
output_channel_
;
matmul_param_
->
deep_
=
conv_param_
->
input_channel_
;
matmul_param_
->
row_8_
=
UP_ROUND
(
matmul_param_
->
row_
,
C8NUM
);
matmul_param_
->
col_8_
=
UP_ROUND
(
matmul_param_
->
col_
,
C8NUM
);
matmul_param_
->
act_type_
=
(
conv_param_
->
is_relu6_
)
?
ActType_Relu6
:
ActType_No
;
matmul_param_
->
act_type_
=
(
conv_param_
->
is_relu_
)
?
ActType_Relu
:
matmul_param_
->
act_type_
;
return
;
}
int
Convolution1x1CPUKernel
::
InitConv1x1BiasWeight
()
{
if
(
inputs_
.
size
()
==
3
)
{
bias_data_
=
malloc
(
matmul_param_
->
col_
*
C4NUM
*
sizeof
(
float
));
bias_data_
=
malloc
(
matmul_param_
->
col_
8_
*
sizeof
(
float
));
if
(
bias_data_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc bias_ptr_ error!"
;
return
RET_ERROR
;
}
memset
(
bias_data_
,
0
,
matmul_param_
->
col_
*
C4NUM
*
sizeof
(
float
));
memset
(
bias_data_
,
0
,
matmul_param_
->
col_
8_
*
sizeof
(
float
));
memcpy
(
bias_data_
,
inputs_
[
2
]
->
Data
(),
conv_param_
->
output_channel_
*
sizeof
(
float
));
}
else
{
bias_data_
=
nullptr
;
}
weight_ptr_
=
reinterpret_cast
<
float
*>
(
malloc
(
matmul_param_
->
col_
*
matmul_param_
->
deep_
*
FP32_STRASSEN_WEIGHT_UINT
*
sizeof
(
float
)));
weight_ptr_
=
reinterpret_cast
<
float
*>
(
malloc
(
matmul_param_
->
row_8_
*
matmul_param_
->
col_8_
*
sizeof
(
float
)));
if
(
weight_ptr_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc weight_ptr_ error!"
;
return
RET_ERROR
;
}
memset
(
weight_ptr_
,
0
,
matmul_param_
->
col_
*
matmul_param_
->
deep_
*
FP32_STRASSEN_WEIGHT_UINT
*
sizeof
(
float
));
Pack1x1WeightFp32
(
reinterpret_cast
<
float
*>
(
inputs_
[
1
]
->
Data
()),
weight_ptr_
,
conv_param_
);
memset
(
weight_ptr_
,
0
,
matmul_param_
->
row_8_
*
matmul_param_
->
col_8_
*
sizeof
(
float
));
RowMajor2Col8Major
(
reinterpret_cast
<
float
*>
(
inputs_
[
1
]
->
Data
()),
weight_ptr_
,
matmul_param_
->
col_
,
matmul_param_
->
deep_
);
return
RET_OK
;
}
...
...
@@ -86,52 +95,43 @@ int Convolution1x1CPUKernel::InitConv1x1Param() {
pre_trans_input_
=
(
conv_param_
->
pad_h_
!=
0
||
conv_param_
->
pad_w_
!=
0
||
conv_param_
->
stride_h_
!=
1
||
conv_param_
->
stride_w_
!=
1
);
if
(
pre_trans_input_
)
{
input_ptr_
=
reinterpret_cast
<
float
*>
(
malloc
(
matmul_param_
->
a_stride
_
*
matmul_param_
->
deep_
*
sizeof
(
float
)));
input_ptr_
=
reinterpret_cast
<
float
*>
(
malloc
(
matmul_param_
->
row
_
*
matmul_param_
->
deep_
*
sizeof
(
float
)));
if
(
input_ptr_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc input_ptr_ error!"
;
return
RET_MEMORY_FAILED
;
}
memset
(
input_ptr_
,
0
,
matmul_param_
->
a_stride
_
*
matmul_param_
->
deep_
*
sizeof
(
float
));
memset
(
input_ptr_
,
0
,
matmul_param_
->
row
_
*
matmul_param_
->
deep_
*
sizeof
(
float
));
}
thread_hw_count_
=
MSMIN
(
opParameter
->
thread_num_
,
matmul_param_
->
row_
);
thread_hw_stride_
=
UP_DIV
(
matmul_param_
->
row_
,
thread_hw_count_
);
thread_oc4_count_
=
MSMIN
(
opParameter
->
thread_num_
,
matmul_param_
->
col_
);
thread_oc_stride_
=
UP_DIV
(
matmul_param_
->
col_
,
thread_oc4_count_
)
*
C4NUM
;
thread_count_
=
MSMIN
(
opParameter
->
thread_num_
,
UP_DIV
(
matmul_param_
->
col_
,
C8NUM
));
thread_stride_
=
UP_DIV
(
UP_DIV
(
matmul_param_
->
col_
,
C8NUM
),
thread_count_
)
*
C8NUM
;
tmp_ptr_
=
reinterpret_cast
<
float
*>
(
malloc
(
matmul_param_
->
a_stride_
*
matmul_param_
->
deep_
*
sizeof
(
float
)));
if
(
tmp_ptr_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc tmp_ptr_ error!"
;
return
RET_MEMORY_FAILED
;
}
c4_output_
=
reinterpret_cast
<
float
*>
(
malloc
(
outputs_
[
0
]
->
ElementsC4Num
()
/
conv_param_
->
output_batch_
*
sizeof
(
float
)));
if
(
c4_output_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc c4_output_ error!"
;
pack_input_
=
reinterpret_cast
<
float
*>
(
malloc
(
matmul_param_
->
row_8_
*
matmul_param_
->
deep_
*
sizeof
(
float
)));
if
(
pack_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc pack_input_ error!"
;
return
RET_MEMORY_FAILED
;
}
memset
(
pack_input_
,
0
,
matmul_param_
->
row_8_
*
matmul_param_
->
deep_
*
sizeof
(
float
));
c4_input_
=
reinterpret_cast
<
float
*>
(
malloc
(
inputs_
[
0
]
->
ElementsC4Num
()
/
conv_param_
->
input_batch_
*
sizeof
(
float
)));
if
(
c4_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc c4_input_ error!"
;
pack_output_
=
reinterpret_cast
<
float
*>
(
malloc
(
matmul_param_
->
row_8_
*
matmul_param_
->
col_8_
*
sizeof
(
float
)));
if
(
pack_output_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc pack_output_ error!"
;
return
RET_MEMORY_FAILED
;
}
memset
(
pack_output_
,
0
,
matmul_param_
->
row_8_
*
matmul_param_
->
col_8_
*
sizeof
(
float
));
return
RET_OK
;
}
void
Convolution1x1CPUKernel
::
Pre1x1Trans
(
float
*
src_input
,
float
*
src_output
)
{
output_ptr_
=
src_output
;
PackNHWCToNC4HW4Fp32
(
src_input
,
c4_input_
,
1
,
conv_param_
->
input_h_
*
conv_param_
->
input_w_
,
conv_param_
->
input_channel_
);
if
(
!
pre_trans_input_
)
{
input_ptr_
=
c4_input_
;
return
;
if
(
pre_trans_input_
)
{
Conv1x1InputPackFp32
(
src_input
,
input_ptr_
,
conv_param_
);
}
else
{
input_ptr_
=
src_input
;
}
Conv1x1InputPackFp32
(
c4_input_
,
input_ptr_
,
conv_param
_
);
RowMajor2Col8Major
(
input_ptr_
,
pack_input_
,
matmul_param_
->
row_
,
matmul_param_
->
deep
_
);
return
;
}
...
...
@@ -152,53 +152,26 @@ int Convolution1x1CPUKernel::Init() {
return
RET_OK
;
}
int
Convolution1x1CPUKernel
::
DoStrassen
(
int
task_id
)
{
matmul_param_
->
row_
=
MSMIN
(
thread_hw_stride_
,
matmul_param_
->
row_
-
task_id
*
thread_hw_stride_
);
if
(
matmul_param_
->
row_
<=
0
)
{
return
RET_OK
;
}
auto
error_code
=
Conv1x1Fp32
(
input_ptr_
+
task_id
*
thread_hw_stride_
*
C4NUM
,
weight_ptr_
,
c4_output_
+
task_id
*
thread_hw_stride_
*
C4NUM
,
tmp_ptr_
+
task_id
*
thread_hw_stride_
*
matmul_param_
->
deep_
*
C4NUM
,
*
matmul_param_
);
if
(
error_code
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"DoStrassen error task_id["
<<
task_id
<<
"] error_code["
<<
error_code
<<
"]"
;
return
RET_ERROR
;
}
matmul_param_
->
row_
=
conv_param_
->
output_h_
*
conv_param_
->
output_w_
;
return
RET_OK
;
}
int
Convolution1x1StrassenRun
(
int
task_id
,
LiteParallelGroupEnv
*
penv
,
void
*
cdata
)
{
auto
conv1x1
=
reinterpret_cast
<
Convolution1x1CPUKernel
*>
(
cdata
);
auto
error_code
=
conv1x1
->
DoStrassen
(
task_id
);
if
(
error_code
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Convolution1x1StrassenRun error task_id["
<<
task_id
<<
"] error_code["
<<
error_code
<<
"]"
;
return
RET_ERROR
;
}
return
RET_OK
;
}
int
Convolution1x1CPUKernel
::
DoPostFunc
(
int
task_id
)
{
int
cur_oc
=
MSMIN
(
thread_oc_stride_
,
conv_param_
->
output_channel_
-
task_id
*
thread_oc_stride_
);
int
Convolution1x1CPUKernel
::
DoConv1x1
(
int
task_id
)
{
int
cur_oc
=
MSMIN
(
thread_stride_
,
matmul_param_
->
col_8_
-
task_id
*
thread_stride_
);
if
(
cur_oc
<=
0
)
{
return
RET_OK
;
}
float
*
cur_bias
=
(
bias_data_
==
nullptr
)
?
nullptr
:
reinterpret_cast
<
float
*>
(
bias_data_
)
+
task_id
*
thread_oc_stride_
;
auto
bias
=
(
bias_data_
==
nullptr
)
?
nullptr
:
reinterpret_cast
<
float
*>
(
bias_data_
)
+
thread_stride_
*
task_id
;
MatMul
(
pack_input_
,
weight_ptr_
+
task_id
*
thread_stride_
*
matmul_param_
->
deep_
,
pack_output_
+
task_id
*
thread_stride_
*
matmul_param_
->
row_8_
,
bias
,
matmul_param_
->
act_type_
,
matmul_param_
->
deep_
,
matmul_param_
->
row_8_
,
cur_oc
);
PostConvFuncFp32
(
c4_output_
+
matmul_param_
->
row_
*
thread_oc_stride_
*
task_id
,
output_ptr_
+
task_id
*
thread_oc_stride_
,
cur_bias
,
cur_oc
,
matmul_param_
->
row_
,
conv_param_
->
output_channel_
,
conv_param_
->
is_relu_
,
conv_param_
->
is_relu6_
);
return
RET_OK
;
}
int
Convolution1x1
PostFunc
Run
(
int
task_id
,
LiteParallelGroupEnv
*
penv
,
void
*
cdata
)
{
int
Convolution1x1Run
(
int
task_id
,
LiteParallelGroupEnv
*
penv
,
void
*
cdata
)
{
auto
conv1x1
=
reinterpret_cast
<
Convolution1x1CPUKernel
*>
(
cdata
);
auto
error_code
=
conv1x1
->
Do
PostFunc
(
task_id
);
auto
error_code
=
conv1x1
->
Do
Conv1x1
(
task_id
);
if
(
error_code
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Convolution1x1
PostFunc
Run error task_id["
<<
task_id
<<
"] error_code["
<<
error_code
<<
"]"
;
MS_LOG
(
ERROR
)
<<
"Convolution1x1Run error task_id["
<<
task_id
<<
"] error_code["
<<
error_code
<<
"]"
;
return
RET_ERROR
;
}
return
RET_OK
;
...
...
@@ -209,20 +182,16 @@ int Convolution1x1CPUKernel::Run() {
auto
src_out
=
reinterpret_cast
<
float
*>
(
outputs_
[
0
]
->
Data
());
for
(
int
batch_index
=
0
;
batch_index
<
conv_param_
->
input_batch_
;
batch_index
++
)
{
Pre1x1Trans
(
src_in
+
batch_index
*
matmul_param_
->
deep_
*
matmul_param_
->
a_stride
_
,
src_out
+
batch_index
*
matmul_param_
->
col_
*
matmul_param_
->
c_stride
_
);
Pre1x1Trans
(
src_in
+
batch_index
*
conv_param_
->
input_h_
*
conv_param_
->
input_w_
*
conv_param_
->
input_channel
_
,
src_out
+
batch_index
*
matmul_param_
->
row_
*
matmul_param_
->
col
_
);
int
error_code
=
LiteBackendParallelLaunch
(
Convolution1x1
StrassenRun
,
this
,
thread_hw
_count_
);
int
error_code
=
LiteBackendParallelLaunch
(
Convolution1x1
Run
,
this
,
thread
_count_
);
if
(
error_code
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"conv1x1 strassen error error_code["
<<
error_code
<<
"]"
;
return
RET_ERROR
;
}
error_code
=
LiteBackendParallelLaunch
(
Convolution1x1PostFuncRun
,
this
,
thread_oc4_count_
);
if
(
error_code
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"conv1x1 post function error error_code["
<<
error_code
<<
"]"
;
return
RET_ERROR
;
}
Row8x8Major2RowMajor
(
pack_output_
,
output_ptr_
,
matmul_param_
->
row_
,
matmul_param_
->
col_
);
}
return
RET_OK
;
}
...
...
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.h
浏览文件 @
8beb1b0f
...
...
@@ -17,6 +17,7 @@
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_1X1_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_1X1_H_
#include <float.h>
#include <vector>
#include "src/lite_kernel.h"
#include "include/errorcode.h"
...
...
@@ -26,21 +27,24 @@
#include "src/runtime/kernel/arm/base/layout_transform.h"
#include "src/runtime/kernel/arm/opclib/fp32/conv.h"
#include "src/runtime/kernel/arm/opclib/fp32/common_func.h"
#include "src/runtime/kernel/arm/opclib/matmul.h"
#include "src/runtime/kernel/arm/opclib/fp32/matmul.h"
namespace
mindspore
::
kernel
{
class
Convolution1x1CPUKernel
:
public
ConvolutionBaseCPUKernel
{
public:
Convolution1x1CPUKernel
(
OpParameter
*
parameter
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
inputs
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
outputs
,
const
Context
*
ctx
)
:
ConvolutionBaseCPUKernel
(
parameter
,
inputs
,
outputs
,
ctx
)
{}
:
ConvolutionBaseCPUKernel
(
parameter
,
inputs
,
outputs
,
ctx
)
{
matmul_param_
=
new
MatMulParameter
();
}
~
Convolution1x1CPUKernel
();
int
Init
()
override
;
int
Run
()
override
;
int
ReSize
()
override
;
public:
int
DoStrassen
(
int
task_id
);
int
DoPostFunc
(
int
task_id
);
int
DoConv1x1
(
int
task_id
);
private:
int
InitConv1x1Param
();
...
...
@@ -49,20 +53,15 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
void
Pre1x1Trans
(
float
*
src_input
,
float
*
src_output
);
private:
Strassen
MatMulParameter
*
matmul_param_
=
nullptr
;
MatMulParameter
*
matmul_param_
=
nullptr
;
bool
pre_trans_input_
=
false
;
int
thread_count_
=
0
;
int
thread_hw_count_
=
0
;
int
thread_hw_stride_
=
0
;
int
thread_oc4_count_
=
0
;
int
thread_oc_stride_
=
0
;
int
thread_stride_
=
0
;
float
*
weight_ptr_
=
nullptr
;
float
*
tmp_ptr_
=
nullptr
;
float
*
c4_input_
=
nullptr
;
float
*
c4_output_
=
nullptr
;
float
*
pack_input_
=
nullptr
;
float
*
pack_output_
=
nullptr
;
float
*
input_ptr_
=
nullptr
;
float
*
output_ptr_
=
nullptr
;
};
}
// namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_1X1_H_
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc
浏览文件 @
8beb1b0f
...
...
@@ -30,27 +30,38 @@ DeConvolutionCPUKernel::~DeConvolutionCPUKernel() {
free
(
weight_ptr_
);
weight_ptr_
=
nullptr
;
}
if
(
tmp_output_
!=
nullptr
)
{
free
(
tmp_output_
);
tmp_output_
=
nullptr
;
}
if
(
tmp_buffer_
!=
nullptr
)
{
free
(
tmp_buffer_
);
tmp_buffer_
=
nullptr
;
}
if
(
c4
_input_
!=
nullptr
)
{
free
(
c4
_input_
);
c4
_input_
=
nullptr
;
if
(
pack
_input_
!=
nullptr
)
{
free
(
pack
_input_
);
pack
_input_
=
nullptr
;
}
if
(
c4
_output_
!=
nullptr
)
{
free
(
c4
_output_
);
c4
_output_
=
nullptr
;
if
(
pack
_output_
!=
nullptr
)
{
free
(
pack
_output_
);
pack
_output_
=
nullptr
;
}
return
;
}
int
DeConvolutionCPUKernel
::
ReSize
()
{
return
0
;
}
int
DeConvolutionCPUKernel
::
ReSize
()
{
if
(
tmp_buffer_
!=
nullptr
)
{
free
(
tmp_buffer_
);
tmp_buffer_
=
nullptr
;
}
if
(
pack_input_
!=
nullptr
)
{
free
(
pack_input_
);
pack_input_
=
nullptr
;
}
if
(
pack_output_
!=
nullptr
)
{
free
(
pack_output_
);
pack_output_
=
nullptr
;
}
InitParam
();
return
RET_OK
;
}
int
DeConvolutionCPUKernel
::
InitWeightBias
()
{
if
(
inputs_
.
size
()
==
3
)
{
...
...
@@ -65,60 +76,50 @@ int DeConvolutionCPUKernel::InitWeightBias() {
bias_data_
=
nullptr
;
}
size_t
weight_pack_size
=
conv_param_
->
kernel_w_
*
conv_param_
->
kernel_h_
*
UP_ROUND
(
conv_param_
->
output_channel_
,
C4NUM
)
*
UP_ROUND
(
conv_param_
->
input_channel_
,
C4NUM
)
*
sizeof
(
float
);
size_t
weight_pack_size
=
conv_param_
->
input_channel_
*
conv_param_
->
kernel_w_
*
conv_param_
->
kernel_h_
*
UP_ROUND
(
conv_param_
->
output_channel_
,
C8NUM
)
*
sizeof
(
float
);
weight_ptr_
=
reinterpret_cast
<
float
*>
(
malloc
(
weight_pack_size
));
if
(
weight_ptr_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"deconv malloc weight_ptr_ error!"
;
return
RET_ERROR
;
}
memset
(
weight_ptr_
,
0
,
weight_pack_size
);
Pack
DeConvWeight
Fp32
(
reinterpret_cast
<
float
*>
(
inputs_
[
1
]
->
Data
()),
weight_ptr_
,
conv_param_
->
input_channel_
,
conv_param_
->
output_channel_
,
conv_param_
->
kernel_w_
*
conv_param_
->
kernel_h
_
);
Pack
NHWCToC8HWN8
Fp32
(
reinterpret_cast
<
float
*>
(
inputs_
[
1
]
->
Data
()),
weight_ptr_
,
conv_param_
->
input_channel_
,
kernel_plane_
,
conv_param_
->
output_channel
_
);
return
RET_OK
;
}
int
DeConvolutionCPUKernel
::
InitParam
()
{
matmul_param_
=
new
StrassenMatMulParameter
();
matmul_param_
->
row_
=
conv_param_
->
input_h_
*
conv_param_
->
input_w_
;
matmul_param_
->
deep_
=
UP_DIV
(
conv_param_
->
input_channel_
,
C4NUM
);
matmul_param_
->
col_
=
UP_DIV
(
conv_param_
->
output_channel_
,
4
)
*
conv_param_
->
kernel_w_
*
conv_param_
->
kernel_h_
;
matmul_param_
->
a_stride_
=
matmul_param_
->
row_
*
C4NUM
;
matmul_param_
->
b_stride_
=
matmul_param_
->
deep_
*
C4NUM
*
C4NUM
;
matmul_param_
->
c_stride_
=
matmul_param_
->
row_
*
C4NUM
;
thread_hw_count_
=
MSMIN
(
opParameter
->
thread_num_
,
matmul_param_
->
row_
);
thread_hw_stride_
=
UP_DIV
(
matmul_param_
->
row_
,
thread_hw_count_
);
thread_co4_count_
=
MSMIN
(
opParameter
->
thread_num_
,
UP_DIV
(
conv_param_
->
output_channel_
,
C4NUM
));
thread_co_stride_
=
UP_DIV
(
UP_DIV
(
conv_param_
->
output_channel_
,
C4NUM
),
thread_co4_count_
)
*
C4NUM
;
tmp_buffer_
=
reinterpret_cast
<
float
*>
(
malloc
(
matmul_param_
->
a_stride_
*
matmul_param_
->
deep_
*
C4NUM
*
sizeof
(
float
)));
if
(
tmp_buffer_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc tmp_buffer_ error!"
;
input_plane_
=
conv_param_
->
input_h_
*
conv_param_
->
input_w_
;
kernel_plane_
=
conv_param_
->
kernel_w_
*
conv_param_
->
kernel_h_
;
output_plane_
=
conv_param_
->
output_h_
*
conv_param_
->
output_w_
;
matmul_param_
->
row_
=
input_plane_
;
matmul_param_
->
deep_
=
conv_param_
->
input_channel_
;
matmul_param_
->
col_
=
conv_param_
->
output_channel_
*
kernel_plane_
;
matmul_param_
->
row_8_
=
UP_ROUND
(
matmul_param_
->
row_
,
C8NUM
);
matmul_param_
->
col_8_
=
UP_ROUND
(
conv_param_
->
output_channel_
,
C8NUM
)
*
kernel_plane_
;
thread_count_
=
MSMIN
(
opParameter
->
thread_num_
,
UP_DIV
(
conv_param_
->
output_channel_
,
C8NUM
));
thread_stride_
=
UP_DIV
(
UP_DIV
(
conv_param_
->
output_channel_
,
C8NUM
),
thread_count_
);
pack_input_
=
reinterpret_cast
<
float
*>
(
malloc
(
matmul_param_
->
row_8_
*
matmul_param_
->
deep_
*
sizeof
(
float
)));
if
(
pack_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"deconv Malloc pack_input_ error!"
;
return
RET_ERROR
;
}
tmp_output_
=
reinterpret_cast
<
float
*>
(
malloc
(
matmul_param_
->
row_
*
matmul_param_
->
col_
*
C4NUM
*
sizeof
(
float
)));
if
(
tmp_output_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc tmp_output_ error!"
;
return
RET_ERROR
;
}
c4_input_
=
reinterpret_cast
<
float
*>
(
malloc
(
inputs_
[
0
]
->
ElementsC4Num
()
/
conv_param_
->
input_batch_
*
sizeof
(
float
)));
if
(
c4_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc c4_input_ error!"
;
pack_output_
=
reinterpret_cast
<
float
*>
(
malloc
(
UP_ROUND
(
conv_param_
->
output_channel_
,
C8NUM
)
*
output_plane_
*
sizeof
(
float
)));
if
(
pack_output_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"deconv Malloc pack_output_ error!"
;
return
RET_NULL_PTR
;
}
c4_output_
=
reinterpret_cast
<
float
*>
(
malloc
(
outputs_
[
0
]
->
ElementsC4Num
()
/
conv_param_
->
output_batch_
*
sizeof
(
float
)));
if
(
c4_output_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc c4_output_ error!"
;
return
RET_NULL_PTR
;
tmp_buffer_
=
reinterpret_cast
<
float
*>
(
malloc
(
matmul_param_
->
row_8_
*
matmul_param_
->
col_8_
*
sizeof
(
float
)));
if
(
tmp_buffer_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Conv1x1 Malloc tmp_buffer_ error!"
;
return
RET_ERROR
;
}
return
RET_OK
;
}
...
...
@@ -132,6 +133,7 @@ int DeConvFp32Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
}
return
RET_OK
;
}
int
DeConvFp32PostRun
(
int
task_id
,
LiteParallelGroupEnv
*
penv
,
void
*
cdata
)
{
auto
deconv
=
reinterpret_cast
<
DeConvolutionCPUKernel
*>
(
cdata
);
auto
error_code
=
deconv
->
DoPostFunc
(
task_id
);
...
...
@@ -141,51 +143,39 @@ int DeConvFp32PostRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
}
return
RET_OK
;
}
int
DeConvolutionCPUKernel
::
DoDeconv
(
int
task_id
)
{
matmul_param_
->
row_
=
MSMIN
(
thread_hw_stride_
,
matmul_param_
->
row_
-
task_id
*
thread_hw
_stride_
);
if
(
matmul_param_
->
row_
<=
0
)
{
int
oc
=
MSMIN
(
thread_stride_
,
UP_DIV
(
conv_param_
->
output_channel_
,
C8NUM
)
-
task_id
*
thread
_stride_
);
if
(
oc
<=
0
)
{
return
RET_OK
;
}
int
error_code
=
DeConvFp32
(
c4_input_
+
task_id
*
thread_hw_stride_
*
C4NUM
,
weight_ptr_
,
tmp_output_
+
task_id
*
thread_hw_stride_
*
C4NUM
,
tmp_buffer_
+
task_id
*
thread_hw_stride_
*
matmul_param_
->
deep_
*
C4NUM
,
*
matmul_param_
);
if
(
error_code
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"DeConvFp32 error! error code: "
<<
error_code
;
return
error_code
;
}
MatMul
(
pack_input_
,
weight_ptr_
+
task_id
*
thread_stride_
*
C8NUM
*
kernel_plane_
*
matmul_param_
->
deep_
,
tmp_buffer_
+
task_id
*
thread_stride_
*
C8NUM
*
kernel_plane_
*
matmul_param_
->
row_8_
,
nullptr
,
ActType_No
,
matmul_param_
->
deep_
,
matmul_param_
->
row_8_
,
oc
*
C8NUM
*
kernel_plane_
);
matmul_param_
->
row_
=
conv_param_
->
input_h_
*
conv_param_
->
input_w_
;
return
RET_OK
;
}
int
DeConvolutionCPUKernel
::
DoPostFunc
(
int
task_id
)
{
int
input_plane
=
conv_param_
->
input_h_
*
conv_param_
->
input_w_
;
int
kernel_plane
=
conv_param_
->
kernel_w_
*
conv_param_
->
kernel_h_
;
int
output_plane
=
conv_param_
->
output_h_
*
conv_param_
->
output_w_
;
int
cur_oc
=
MSMIN
(
thread_co_stride_
,
conv_param_
->
output_channel_
-
task_id
*
thread_co_stride_
);
if
(
cur_oc
<=
0
)
{
int
oc
=
MSMIN
(
thread_stride_
*
C8NUM
,
conv_param_
->
output_channel_
-
task_id
*
thread_stride_
*
C8NUM
);
if
(
oc
<=
0
)
{
return
RET_OK
;
}
float
*
cur_
bias
=
(
bias_data_
==
nullptr
)
?
nullptr
:
reinterpret_cast
<
float
*>
(
bias_data_
)
+
thread_
co_stride_
*
task_id
;
float
*
bias
=
(
bias_data_
==
nullptr
)
?
nullptr
:
reinterpret_cast
<
float
*>
(
bias_data_
)
+
thread_
stride_
*
task_id
*
C8NUM
;
DeConvPostFp32
(
tmp_output_
+
thread_co_stride_
*
task_id
*
input_plane
*
kernel_plane
,
c4_output_
+
thread_co_stride_
*
task_id
*
output_plane
,
output_ptr_
+
thread_co_stride_
*
task_id
,
cur_bias
,
cur_oc
,
input_plane
,
kernel_plane
,
output_plane
,
conv_param_
);
DeConvPostFp32
C8x8
(
tmp_buffer_
+
task_id
*
thread_stride_
*
C8NUM
*
kernel_plane_
*
matmul_param_
->
row_8_
,
pack_output_
+
task_id
*
thread_stride_
*
C8NUM
*
output_plane_
,
bias
,
output_ptr_
+
task_id
*
thread_stride_
*
C8NUM
,
oc
,
conv_param_
);
return
RET_OK
;
}
int
DeConvolutionCPUKernel
::
Init
()
{
int
error_code
=
ConvolutionBaseCPUKernel
::
Init
();
if
(
error_code
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Conv base init error!"
;
return
error_code
;
}
ConvolutionBaseCPUKernel
::
Init
();
error_code
=
InitParam
();
int
error_code
=
InitParam
();
if
(
error_code
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"deconv InitParam error!"
;
return
error_code
;
...
...
@@ -204,20 +194,18 @@ int DeConvolutionCPUKernel::Run() {
float
*
src_out
=
reinterpret_cast
<
float
*>
(
outputs_
[
0
]
->
Data
());
for
(
int
batch_index
=
0
;
batch_index
<
conv_param_
->
input_batch_
;
batch_index
++
)
{
input_ptr_
=
src_in
+
batch_index
*
conv_param_
->
input_w_
*
conv_param_
->
input_h_
*
conv_param_
->
input_channel_
;
output_ptr_
=
src_out
+
batch_index
*
conv_param_
->
output_h_
*
conv_param_
->
output_w_
*
conv_param_
->
output_channel_
;
input_ptr_
=
src_in
+
batch_index
*
input_plane_
*
conv_param_
->
input_channel_
;
output_ptr_
=
src_out
+
batch_index
*
output_plane_
*
conv_param_
->
output_channel_
;
PackNHWCToNC4HW4Fp32
(
input_ptr_
,
c4_input_
,
1
,
conv_param_
->
input_h_
*
conv_param_
->
input_w_
,
conv_param_
->
input_channel_
);
RowMajor2Col8Major
(
input_ptr_
,
pack_input_
,
input_plane_
,
conv_param_
->
input_channel_
);
int
error_code
=
LiteBackendParallelLaunch
(
DeConvFp32Run
,
this
,
thread_
hw_
count_
);
int
error_code
=
LiteBackendParallelLaunch
(
DeConvFp32Run
,
this
,
thread_count_
);
if
(
error_code
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"deconv fp32 run error! error_code["
<<
error_code
<<
"]"
;
return
RET_ERROR
;
}
error_code
=
LiteBackendParallelLaunch
(
DeConvFp32PostRun
,
this
,
thread_co
4_co
unt_
);
error_code
=
LiteBackendParallelLaunch
(
DeConvFp32PostRun
,
this
,
thread_count_
);
if
(
error_code
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"deconv fp32 postrun error! error_code["
<<
error_code
<<
"]"
;
return
RET_ERROR
;
...
...
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.h
浏览文件 @
8beb1b0f
...
...
@@ -17,6 +17,7 @@
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_DECONVOLUTION_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_DECONVOLUTION_H_
#include <float.h>
#include <vector>
#include "src/lite_kernel.h"
#include "src/kernel_registry.h"
...
...
@@ -24,13 +25,16 @@
#include "schema/model_generated.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "src/runtime/kernel/arm/opclib/fp32/deconv.h"
#include "src/runtime/kernel/arm/opclib/fp32/matmul.h"
namespace
mindspore
::
kernel
{
class
DeConvolutionCPUKernel
:
public
ConvolutionBaseCPUKernel
{
public:
DeConvolutionCPUKernel
(
OpParameter
*
parameter
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
inputs
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
outputs
,
const
Context
*
ctx
)
:
ConvolutionBaseCPUKernel
(
parameter
,
inputs
,
outputs
,
ctx
)
{}
:
ConvolutionBaseCPUKernel
(
parameter
,
inputs
,
outputs
,
ctx
)
{
matmul_param_
=
new
MatMulParameter
();
}
~
DeConvolutionCPUKernel
()
override
;
int
Init
()
override
;
int
Run
()
override
;
...
...
@@ -45,19 +49,18 @@ class DeConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
int
InitWeightBias
();
private:
StrassenMatMulParameter
*
matmul_param_
;
int
thread_hw_count_
;
int
thread_hw_stride_
;
int
thread_co4_count_
;
int
thread_co_stride_
;
MatMulParameter
*
matmul_param_
;
int
input_plane_
;
int
kernel_plane_
;
int
output_plane_
;
int
thread_count_
;
int
thread_stride_
;
float
*
weight_ptr_
;
float
*
pack_input_
;
float
*
pack_output_
;
float
*
tmp_buffer_
;
float
*
tmp_output_
;
float
*
c4_input_
;
float
*
c4_output_
;
float
*
input_ptr_
;
float
*
output_ptr_
;
};
}
// namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_DECONVOLUTION_H_
mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc
浏览文件 @
8beb1b0f
...
...
@@ -99,8 +99,8 @@ int FullconnectionCPUKernel::DoMatmul(int task_id) {
MatMul
(
a_c8_ptr_
,
b_r8_ptr_
+
task_id
*
thread_stride_
*
C8NUM
*
fc_param_
->
deep_
,
c_r8x8_ptr_
+
task_id
*
thread_stride_
*
C8NUM
*
fc_param_
->
row_8_
,
bias_ptr_
+
task_id
*
thread_stride_
*
C8NUM
,
fc_param_
->
maxf_
,
fc_param_
->
minf_
,
fc_param_
->
deep
_
,
fc_param_
->
row_8_
,
cur_oc
*
8
);
bias_ptr_
+
task_id
*
thread_stride_
*
C8NUM
,
fc_param_
->
act_type_
,
fc_param_
->
deep_
,
fc_param_
->
row_8
_
,
cur_oc
*
8
);
return
RET_OK
;
}
...
...
mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc
浏览文件 @
8beb1b0f
...
...
@@ -82,9 +82,9 @@ int FullconnectionInt8CPUKernel::Init() {
double
real_multiplier
=
quant_params_
.
input
.
scale_
*
quant_params_
.
weight
.
scale_
/
quant_params_
.
output
.
scale_
;
QuantizeRoundParameter
(
real_multiplier
,
&
quant_params_
.
quant_multiplier
,
&
quant_params_
.
left_shift
,
&
quant_params_
.
right_shift
);
CalculateActivationRangeQuantized
(
fc_param_
->
maxf_
,
fc_param_
->
minf_
,
quant_params_
.
output
.
scale_
,
quant_params_
.
output
.
zp_
,
&
quant_params_
.
out_act_max
,
&
quant_params_
.
out_act_min
);
CalculateActivationRangeQuantized
(
fc_param_
->
act_type_
==
ActType_Relu
,
fc_param_
->
act_type_
==
ActType_Relu6
,
quant_params_
.
output
.
zp_
,
quant_params_
.
output
.
scale_
,
&
quant_params_
.
out_act_max
,
&
quant_params_
.
out_act_min
);
return
RET_OK
;
}
...
...
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.cc
浏览文件 @
8beb1b0f
...
...
@@ -63,23 +63,29 @@ void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr
return
;
}
void
PostConvFuncFp32
(
const
float
*
c4_out_ptr
,
float
*
out_ptr
,
const
float
*
bias_ptr
,
size_t
output_channel
,
size_t
plane_size
,
size_t
stride
,
bool
is_relu
,
bool
is_relu6
)
{
#ifndef ENABLE_ARM64
void
PostConvFuncComm
(
const
float
*
src_ptr_
,
float
*
out_ptr
,
const
float
*
bias_ptr
,
size_t
output_channel
,
size_t
plane_size
,
size_t
stride
,
bool
is_relu
,
bool
is_relu6
,
int
size
)
{
for
(
int
oc
=
0
;
oc
<
output_channel
;
oc
++
)
{
int
oc
4div
=
oc
/
4
,
oc4mod
=
oc
%
4
;
int
oc
_div
=
oc
/
size
,
oc_mod
=
oc
%
size
;
for
(
int
hw
=
0
;
hw
<
plane_size
;
hw
++
)
{
int
src_index
=
oc
4div
*
4
*
plane_size
+
hw
*
4
+
oc4
mod
;
int
src_index
=
oc
_div
*
size
*
plane_size
+
hw
*
size
+
oc_
mod
;
int
dst_index
=
hw
*
stride
+
oc
;
float
value
=
c4_out_ptr
[
src_index
];
float
value
=
src_ptr_
[
src_index
];
if
(
bias_ptr
!=
nullptr
)
{
value
=
value
+
bias_ptr
[
oc
];
}
value
=
(
is_relu
)
?
(
MSMAX
(
0
,
value
))
:
(
value
);
value
=
(
is_relu6
)
?
(
MSMIN
(
6
,
MSMAX
(
0
,
value
)
))
:
(
value
);
value
=
(
is_relu
||
is_relu6
)
?
(
MSMAX
(
0.
f
,
value
))
:
(
value
);
value
=
(
is_relu6
)
?
(
MSMIN
(
6
.
f
,
value
))
:
(
value
);
out_ptr
[
dst_index
]
=
value
;
}
}
return
;
}
void
PostConvFuncFp32C4
(
const
float
*
c4_out_ptr
,
float
*
out_ptr
,
const
float
*
bias_ptr
,
size_t
output_channel
,
size_t
plane_size
,
size_t
stride
,
bool
is_relu
,
bool
is_relu6
)
{
#ifndef ENABLE_ARM64
PostConvFuncComm
(
c4_out_ptr
,
out_ptr
,
bias_ptr
,
output_channel
,
plane_size
,
stride
,
is_relu
,
is_relu6
,
C4NUM
);
#else
if
(
bias_ptr
!=
nullptr
)
{
if
(
is_relu
)
{
...
...
@@ -102,3 +108,8 @@ void PostConvFuncFp32(const float *c4_out_ptr, float *out_ptr, const float *bias
return
;
}
void
PostConvFuncFp32C8
(
const
float
*
c8_out_ptr
,
float
*
out_ptr
,
const
float
*
bias_ptr
,
size_t
output_channel
,
size_t
plane_size
,
size_t
stride
,
bool
is_relu
,
bool
is_relu6
)
{
PostConvFuncComm
(
c8_out_ptr
,
out_ptr
,
bias_ptr
,
output_channel
,
plane_size
,
stride
,
is_relu
,
is_relu6
,
C8NUM
);
return
;
}
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h
浏览文件 @
8beb1b0f
...
...
@@ -27,8 +27,10 @@
extern
"C"
{
#endif
void
PostConvFuncFp32
(
const
float
*
c4_out_ptr
,
float
*
out_ptr
,
const
float
*
bias_ptr
,
size_t
output_channel
,
size_t
plane_size
,
size_t
stride
,
bool
is_relu
,
bool
is_relu6
);
void
PostConvFuncFp32C4
(
const
float
*
c4_out_ptr
,
float
*
out_ptr
,
const
float
*
bias_ptr
,
size_t
output_channel
,
size_t
plane_size
,
size_t
stride
,
bool
is_relu
,
bool
is_relu6
);
void
PostConvFuncFp32C8
(
const
float
*
c8_out_ptr
,
float
*
out_ptr
,
const
float
*
bias_ptr
,
size_t
output_channel
,
size_t
plane_size
,
size_t
stride
,
bool
is_relu
,
bool
is_relu6
);
void
MatrixAdd
(
const
float
*
a_ptr
,
const
float
*
b_ptr
,
float
*
dst
,
size_t
a_stride
,
size_t
b_stride
,
size_t
c_stride
,
size_t
row
,
size_t
col
);
void
MatrixSub
(
const
float
*
a_ptr
,
const
float
*
b_ptr
,
float
*
dst
,
size_t
a_stride
,
size_t
b_stride
,
size_t
c_stride
,
...
...
@@ -60,4 +62,3 @@ void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_
#endif
#endif
/* MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_COMMON_FUNC_H_ */
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/deconv.cc
浏览文件 @
8beb1b0f
...
...
@@ -38,8 +38,52 @@ int DeConvFp32(const float *input, const float *weight, float *output, float *tm
return
StrassenMatmul
(
input
,
weight
,
output
,
&
matmul_param
,
FP32_STRASSEN_MAX_RECURSION
,
0
,
tmp_buffer
);
}
int
DeConvPostFp32
(
const
float
*
src
,
float
*
tmp_c4
,
float
*
dst
,
const
float
*
bias
,
int
output_channel
,
int
input_plane
,
int
kernel_plane
,
int
output_plane
,
ConvParameter
*
conv_param
)
{
int
DeConvPostFp32C8x8
(
const
float
*
src
,
float
*
tmp
,
const
float
*
bias
,
float
*
dst
,
int
output_channel
,
ConvParameter
*
conv_param
)
{
/* row8x8-major(ih*iw x oc*kh*kw) -> row8-major(oh*ow x oc) */
size_t
input_plane
=
conv_param
->
input_w_
*
conv_param
->
input_h_
;
size_t
kernel_plane
=
conv_param
->
kernel_w_
*
conv_param
->
kernel_h_
;
size_t
output_plane
=
conv_param
->
output_w_
*
conv_param
->
output_h_
;
int
oc8
=
UP_DIV
(
output_channel
,
C8NUM
);
int
in_plane8
=
UP_ROUND
(
input_plane
,
C8NUM
);
for
(
int
c
=
0
;
c
<
oc8
;
c
++
)
{
float
*
dst_ptr
=
tmp
+
c
*
output_plane
*
C8NUM
;
const
float
*
src_ptr
=
src
+
c
*
in_plane8
*
kernel_plane
*
C8NUM
;
memset
(
dst_ptr
,
0
,
output_plane
*
C8NUM
*
sizeof
(
int32_t
));
for
(
int
ih
=
0
;
ih
<
conv_param
->
input_h_
;
ih
++
)
{
for
(
int
iw
=
0
;
iw
<
conv_param
->
input_w_
;
iw
++
)
{
int
oh
=
ih
*
conv_param
->
stride_h_
-
conv_param
->
pad_h_
;
int
ow
=
iw
*
conv_param
->
stride_w_
-
conv_param
->
pad_w_
;
int
kh_start
=
MSMAX
(
0
,
UP_DIV
(
-
oh
,
conv_param
->
dilation_h_
));
int
kh_end
=
MSMIN
(
conv_param
->
kernel_h_
,
UP_DIV
(
conv_param
->
output_h_
-
oh
,
conv_param
->
dilation_h_
));
int
kw_start
=
MSMAX
(
0
,
UP_DIV
(
-
ow
,
conv_param
->
dilation_w_
));
int
kw_end
=
MSMIN
(
conv_param
->
kernel_w_
,
UP_DIV
(
conv_param
->
output_w_
-
ow
,
conv_param
->
dilation_w_
));
for
(
int
kh
=
kh_start
;
kh
<
kh_end
;
kh
++
)
{
for
(
int
kw
=
kw_start
;
kw
<
kw_end
;
kw
++
)
{
int
src_index
=
ih
*
conv_param
->
input_w_
*
C8NUM
+
iw
*
C8NUM
+
kh
*
in_plane8
*
conv_param
->
kernel_w_
*
C8NUM
+
kw
*
in_plane8
*
C8NUM
;
int
dst_index
=
oh
*
conv_param
->
output_w_
*
C8NUM
+
ow
*
C8NUM
+
kh
*
conv_param
->
dilation_h_
*
conv_param
->
output_w_
*
C8NUM
+
kw
*
conv_param
->
dilation_w_
*
C8NUM
;
for
(
int
i
=
0
;
i
<
C8NUM
;
i
++
)
{
dst_ptr
[
dst_index
+
i
]
+=
src_ptr
[
src_index
+
i
];
}
}
/*kw*/
}
/*kh*/
}
/*iw*/
}
/*ih*/
}
/*oc8*/
PostConvFuncFp32C8
(
tmp
,
dst
,
bias
,
output_channel
,
output_plane
,
conv_param
->
output_channel_
,
conv_param
->
is_relu_
,
conv_param
->
is_relu6_
);
return
OPCLIB_OK
;
}
int
DeConvPostFp32C4
(
const
float
*
src
,
float
*
tmp_c4
,
float
*
dst
,
const
float
*
bias
,
int
output_channel
,
int
input_plane
,
int
kernel_plane
,
int
output_plane
,
ConvParameter
*
conv_param
)
{
int
oc4
=
UP_DIV
(
output_channel
,
C4NUM
);
for
(
int
c
=
0
;
c
<
oc4
;
c
++
)
{
float
*
dst_ptr
=
tmp_c4
+
c
*
output_plane
*
C4NUM
;
...
...
@@ -71,8 +115,7 @@ int DeConvPostFp32(const float *src, float *tmp_c4, float *dst, const float *bia
}
/*ih*/
}
/*oc4*/
PostConvFuncFp32
(
tmp_c4
,
dst
,
bias
,
output_channel
,
output_plane
,
conv_param
->
output_channel_
,
conv_param
->
is_relu_
,
conv_param
->
is_relu6_
);
PostConvFuncFp32
C4
(
tmp_c4
,
dst
,
bias
,
output_channel
,
output_plane
,
conv_param
->
output_channel_
,
conv_param
->
is_relu_
,
conv_param
->
is_relu6_
);
return
OPCLIB_OK
;
}
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/deconv.h
浏览文件 @
8beb1b0f
...
...
@@ -26,8 +26,9 @@ void PackDeConvWeightFp32(const float *weight, float *dst, int input_channel, in
int
DeConvFp32
(
const
float
*
input
,
const
float
*
weight
,
float
*
output
,
float
*
tmp_buffer
,
StrassenMatMulParameter
matmul_param
);
int
DeConvPostFp32
(
const
float
*
src
,
float
*
tmp_c4
,
float
*
dst
,
const
float
*
bias
,
int
output_channel
,
int
input_plane
,
int
kernel_plane
,
int
output_plane
,
ConvParameter
*
conv_param
);
int
DeConvPostFp32C4
(
const
float
*
src
,
float
*
tmp_c4
,
float
*
dst
,
const
float
*
bias
,
int
output_channel
,
int
input_plane
,
int
kernel_plane
,
int
output_plane
,
ConvParameter
*
conv_param
);
int
DeConvPostFp32C8x8
(
const
float
*
src
,
float
*
tmp_out
,
const
float
*
bias
,
float
*
dst
,
int
output_channel
,
ConvParameter
*
conv_param
);
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_DECONV_H_
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/matmul.cc
浏览文件 @
8beb1b0f
...
...
@@ -48,10 +48,11 @@ void Row8x8Major2RowMajor(float *src_ptr, float *dst_ptr, int row, int col) {
dst_ptr
[
r
*
col
+
c
]
=
src_ptr
[
cd8
*
row8
*
8
+
r
*
8
+
cm8
];
}
}
return
;
}
void
MatMul8x8
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
const
float
*
bias
,
float
maxf
,
float
minf
,
int
deep
,
int
row_8_
,
int
col_8_
)
{
void
MatMul8x8
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
const
float
*
bias
,
ActType
act_type
,
int
deep
,
int
row_8_
,
int
col_8_
)
{
/* col8-major * row8-major => col8x8-major */
for
(
int
row
=
0
;
row
<
row_8_
;
row
++
)
{
for
(
int
col
=
0
;
col
<
col_8_
;
col
++
)
{
...
...
@@ -64,19 +65,25 @@ void MatMul8x8(const float *a, const float *b, float *c, const float *bias, floa
size_t
bi
=
c8div
*
deep
*
8
+
d
*
8
+
c8mod
;
value
=
value
+
a
[
ai
]
*
b
[
bi
];
}
value
+=
bias
[
col
];
value
=
MSMIN
(
maxf
,
value
);
value
=
MSMAX
(
minf
,
value
);
if
(
bias
!=
nullptr
)
{
value
+=
bias
[
col
];
}
if
(
act_type
==
ActType_Relu6
)
value
=
MSMIN
(
6.0
f
,
value
);
if
(
act_type
!=
ActType_No
)
value
=
MSMAX
(
0.0
f
,
value
);
c
[
ci
]
=
value
;
}
}
return
;
}
void
MatMul
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
const
float
*
bias
,
float
maxf
,
float
minf
,
int
deep
,
int
row_8_
,
void
MatMul
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
const
float
*
bias
,
ActType
act_type
,
int
deep
,
int
row_8_
,
int
col_8_
)
{
#ifdef __aarch64__
float
minf
=
(
act_type
==
ActType_No
)
?
FLT_MIN
:
0.
f
;
float
maxf
=
(
act_type
==
ActType_Relu6
)
?
6.0
f
:
FLT_MAX
;
MatMulFloatNeon64
(
a
,
b
,
c
,
bias
,
maxf
,
minf
,
deep
,
row_8_
,
col_8_
);
#else
MatMul8x8
(
a
,
b
,
c
,
bias
,
maxf
,
minf
,
deep
,
row_8_
,
col_8_
);
MatMul8x8
(
a
,
b
,
c
,
bias
,
act_type
,
deep
,
row_8_
,
col_8_
);
#endif
return
;
}
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/matmul.h
浏览文件 @
8beb1b0f
...
...
@@ -17,12 +17,12 @@
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_MATMUL_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_MATMUL_H_
#include <float.h>
#include "src/runtime/kernel/arm/opclib/errorcode.h"
#include "src/runtime/kernel/arm/opclib/op_base.h"
#include "src/runtime/kernel/arm/opclib/matmul.h"
void
MatMul
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
const
float
*
bias
,
float
maxf
,
float
minf
,
int
depth
,
int
row
,
int
col
);
void
MatMul
(
const
float
*
a
,
const
float
*
b
,
float
*
c
,
const
float
*
bias
,
ActType
act_type
,
int
depth
,
int
row
,
int
col
);
void
RowMajor2Row8Major
(
float
*
src_ptr
,
float
*
dst_ptr
,
int
row
,
int
col
);
void
RowMajor2Col8Major
(
float
*
src_ptr
,
float
*
dst_ptr
,
int
row
,
int
col
);
void
Row8x8Major2RowMajor
(
float
*
src_ptr
,
float
*
dst_ptr
,
int
row
,
int
col
);
...
...
mindspore/lite/src/runtime/kernel/arm/opclib/int8/deconv.cc
浏览文件 @
8beb1b0f
...
...
@@ -25,15 +25,18 @@ int DeConvInt8(const int8_t *input, const int8_t *weight, int32_t *output, size_
int
DeConvPostInt8
(
const
int32_t
*
src
,
const
int32_t
*
bias
,
int32_t
*
tmp
,
int8_t
*
out
,
int
output_channel
,
ConvParameter
*
conv_param
)
{
int
oc8
=
UP_DIV
(
output_channel
,
C8NUM
);
/* row8x8-major(ih*iw x oc*kh*kw) -> row8x8-major(oh*ow x oc) */
size_t
input_plane
=
conv_param
->
input_w_
*
conv_param
->
input_h_
;
size_t
kernel_plane
=
conv_param
->
kernel_w_
*
conv_param
->
kernel_h_
;
size_t
output_plane
=
conv_param
->
output_w_
*
conv_param
->
output_h_
;
int
oc8
=
UP_DIV
(
output_channel
,
C8NUM
);
int
in_plane8
=
UP_ROUND
(
input_plane
,
8
);
int
out_plane8
=
UP_ROUND
(
output_plane
,
8
);
for
(
int
c
=
0
;
c
<
oc8
;
c
++
)
{
int32_t
*
dst_ptr
=
tmp
+
c
*
out
put_plane
*
C8NUM
;
const
int32_t
*
src_ptr
=
src
+
c
*
in
put_plane
*
kernel_plane
*
C8NUM
;
memset
(
dst_ptr
,
0
,
out
put_plane
*
C8NUM
*
sizeof
(
int32_t
));
int32_t
*
dst_ptr
=
tmp
+
c
*
out
_plane8
*
C8NUM
;
const
int32_t
*
src_ptr
=
src
+
c
*
in
_plane8
*
kernel_plane
*
C8NUM
;
memset
(
dst_ptr
,
0
,
out
_plane8
*
C8NUM
*
sizeof
(
int32_t
));
for
(
int
ih
=
0
;
ih
<
conv_param
->
input_h_
;
ih
++
)
{
for
(
int
iw
=
0
;
iw
<
conv_param
->
input_w_
;
iw
++
)
{
...
...
@@ -60,7 +63,7 @@ int DeConvPostInt8(const int32_t *src, const int32_t *bias, int32_t *tmp, int8_t
}
/*ih*/
}
/*oc8*/
PostFuncInt8
(
tmp
,
bias
,
out
,
output_channel
,
output_plane
,
UP_ROUND
(
output_plane
,
8
)
,
PostFuncInt8
(
tmp
,
bias
,
out
,
output_channel
,
output_plane
,
out_plane8
,
conv_param
->
conv_quant_arg_
.
quant_multiplier_
[
0
],
conv_param
->
conv_quant_arg_
.
left_shift_
[
0
],
conv_param
->
conv_quant_arg_
.
right_shift_
[
0
],
conv_param
->
conv_quant_arg_
.
quant_args_
[
2
][
0
].
zp_
,
conv_param
->
conv_quant_arg_
.
out_act_min_
[
0
],
conv_param
->
conv_quant_arg_
.
out_act_max_
[
0
]);
...
...
mindspore/lite/src/runtime/kernel/arm/opclib/matmul.h
浏览文件 @
8beb1b0f
...
...
@@ -19,6 +19,8 @@
#include "src/runtime/kernel/arm/opclib/op_base.h"
enum
ActType
{
ActType_No
,
ActType_Relu
,
ActType_Relu6
};
struct
MatMulParameter
{
OpParameter
op_parameter_
;
int
row_
;
...
...
@@ -26,12 +28,10 @@ struct MatMulParameter {
int
row_8_
;
int
col_8_
;
int
deep_
;
float
minf_
;
float
maxf_
;
bool
has_bias_
;
bool
a_transpose_
;
/* false : row-major */
bool
b_transpose_
;
/* true : col-major */
ActType
act_type_
;
};
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_MATMUL_H_
mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc
浏览文件 @
8beb1b0f
...
...
@@ -150,23 +150,21 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p
}
void
Conv1x1InputPackFp32
(
const
float
*
src
,
float
*
dst
,
ConvParameter
*
conv_param
)
{
for
(
int
c
=
0
;
c
<
UP_DIV
(
conv_param
->
input_channel_
,
C4NUM
);
c
++
)
{
const
float
*
src_c_ptr
=
src
+
c
*
conv_param
->
input_h_
*
conv_param
->
input_w_
*
C4NUM
;
float
*
dst_c_ptr
=
dst
+
c
*
conv_param
->
output_h_
*
conv_param
->
output_w_
*
C4NUM
;
for
(
int
dst_h
=
0
;
dst_h
<
conv_param
->
output_h_
;
dst_h
++
)
{
int
src_h
=
dst_h
*
conv_param
->
stride_h_
-
conv_param
->
pad_h_
;
if
(
src_h
<
0
||
src_h
>=
conv_param
->
input_h_
)
{
/* support nhwc */
for
(
int
dst_h
=
0
;
dst_h
<
conv_param
->
output_h_
;
dst_h
++
)
{
int
src_h
=
dst_h
*
conv_param
->
stride_h_
-
conv_param
->
pad_h_
;
if
(
src_h
<
0
||
src_h
>=
conv_param
->
input_h_
)
{
continue
;
}
const
float
*
src_h_ptr
=
src
+
src_h
*
conv_param
->
input_w_
*
conv_param
->
input_channel_
;
float
*
dst_h_ptr
=
dst
+
dst_h
*
conv_param
->
output_w_
*
conv_param
->
input_channel_
;
for
(
int
dst_w
=
0
;
dst_w
<
conv_param
->
output_w_
;
dst_w
++
)
{
int
src_w
=
dst_w
*
conv_param
->
stride_w_
-
conv_param
->
pad_w_
;
if
(
src_w
<
0
||
src_w
>=
conv_param
->
input_w_
)
{
continue
;
}
const
float
*
src_h_ptr
=
src_c_ptr
+
src_h
*
conv_param
->
input_w_
*
C4NUM
;
float
*
dst_h_ptr
=
dst_c_ptr
+
dst_h
*
conv_param
->
output_w_
*
C4NUM
;
for
(
int
dst_w
=
0
;
dst_w
<
conv_param
->
output_w_
;
dst_w
++
)
{
int
src_w
=
dst_w
*
conv_param
->
stride_w_
-
conv_param
->
pad_w_
;
if
(
src_w
<
0
||
src_w
>=
conv_param
->
input_w_
)
{
continue
;
}
memcpy
(
dst_h_ptr
+
dst_w
*
C4NUM
,
src_h_ptr
+
src_w
*
C4NUM
,
C4NUM
*
sizeof
(
float
));
}
memcpy
(
dst_h_ptr
+
dst_w
*
conv_param
->
input_channel_
,
src_h_ptr
+
src_w
*
conv_param
->
input_channel_
,
conv_param
->
input_channel_
*
sizeof
(
float
));
}
}
return
;
...
...
@@ -572,6 +570,21 @@ void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int
}
}
void
PackNHWCToC8HWN8Fp32
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
)
{
for
(
int
n
=
0
;
n
<
batch
;
n
++
)
{
for
(
int
hw
=
0
;
hw
<
plane
;
hw
++
)
{
for
(
int
c
=
0
;
c
<
channel
;
c
++
)
{
int
c8div
=
c
/
C8NUM
;
int
c8mod
=
c
%
C8NUM
;
int
src_index
=
n
*
plane
*
channel
+
hw
*
channel
+
c
;
int
dst_index
=
c8div
*
batch
*
plane
*
C8NUM
+
hw
*
batch
*
C8NUM
+
n
*
C8NUM
+
c8mod
;
((
float
*
)
dst
)[
dst_index
]
=
((
float
*
)
src
)[
src_index
];
}
}
}
return
;
}
void
PackNHWCToNHWC4Int8
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
)
{
int
c4
=
UP_DIV
(
channel
,
C4NUM
);
int
nhwc4_batch_unit_offset
=
c4
*
C4NUM
*
plane
;
...
...
mindspore/lite/src/runtime/kernel/arm/opclib/pack.h
浏览文件 @
8beb1b0f
...
...
@@ -69,6 +69,8 @@ void PackNC4HW4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int
void
PackNC4HW4ToNCHWFp32
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
);
void
PackNHWCToC8HWN8Fp32
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
);
void
PackNHWCToNHWC4Int8
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
);
void
PackNHWC4ToNHWCInt8
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
);
...
...
mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h
浏览文件 @
8beb1b0f
...
...
@@ -21,6 +21,7 @@
#include <math.h>
#include <stdlib.h>
#include <limits.h>
#include <limits>
struct
QuantArg
{
double
scale_
;
...
...
@@ -112,13 +113,21 @@ inline uint8_t QuantizeToUint8(float real_value, float scale, int32_t zp) { retu
inline
int32_t
QuantizeToInt8
(
float
real_value
,
float
scale
,
int32_t
zp
)
{
return
round
(
real_value
/
scale
+
zp
);
}
inline
void
CalculateActivationRangeQuantized
(
float
fmax
,
float
fmin
,
float
scale
,
int
zero_point
,
int
*
imax
,
int
*
imin
)
{
int8_t
qmin
=
(
int8_t
)
CHAR_MIN
;
int8_t
qmax
=
(
int8_t
)
CHAR_MAX
;
int8_t
qfmin
=
QuantizeToInt8
(
fmin
,
scale
,
zero_point
);
int8_t
qfmax
=
QuantizeToInt8
(
fmax
,
scale
,
zero_point
);
*
imin
=
qmin
<
qfmin
?
qmin
:
qfmin
;
*
imax
=
qmax
>
qfmax
?
qmax
:
qfmax
;
inline
void
CalculateActivationRangeQuantized
(
bool
is_relu
,
bool
is_relu6
,
int32_t
zp
,
int32_t
scale
,
int
*
mini
,
int
*
maxi
)
{
int32_t
min
=
std
::
numeric_limits
<
int8_t
>::
min
();
int32_t
max
=
std
::
numeric_limits
<
int8_t
>::
max
();
int32_t
quantized_zero
=
QuantizeToInt8
(
0
,
scale
,
zp
);
int32_t
quantized_six
=
QuantizeToInt8
(
6
,
scale
,
zp
);
if
(
is_relu
)
{
min
=
min
>
quantized_zero
?
min
:
quantized_zero
;
}
else
if
(
is_relu6
)
{
min
=
min
>
quantized_zero
?
min
:
quantized_zero
;
max
=
max
<
quantized_six
?
max
:
quantized_six
;
}
else
{
// do nothing
}
*
mini
=
min
;
*
maxi
=
max
;
}
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_QUANTIZATION_QUANTIZE_H_
mindspore/lite/test/run_test.sh
浏览文件 @
8beb1b0f
...
...
@@ -6,5 +6,15 @@ BUILD_DIR=${CUR_DIR}/../build
mkdir
-pv
${
CUR_DIR
}
/do_test
cd
${
CUR_DIR
}
/do_test
cp
${
BUILD_DIR
}
/test/lite-test ./
cp
-r
${
CUR_DIR
}
/ut/src/runtime/kernel/arm/test_data/
*
./
./lite-test
--gtest_filter
=
"*TestHebing*"
./lite-test
--gtest_filter
=
TestFcFp32
*
./lite-test
--gtest_filter
=
TestConv1x1Fp32
*
./lite-test
--gtest_filter
=
TestStrassenFp32
*
./lite-test
--gtest_filter
=
TestDeConvolutionFp32
*
./lite-test
--gtest_filter
=
TestPadInt8
*
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc
0 → 100644
浏览文件 @
8beb1b0f
此差异已折叠。
点击以展开。
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc
0 → 100644
浏览文件 @
8beb1b0f
此差异已折叠。
点击以展开。
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/fullconnection_fp32_tests.cc
0 → 100644
浏览文件 @
8beb1b0f
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <sys/time.h>
#include <iostream>
#include <memory>
#include "utils/log_adapter.h"
#include "common/common_test.h"
#include "src/common/file_utils.h"
#include "src/runtime/kernel/arm/fp32/fullconnection.h"
#include "src/runtime/kernel/arm/opclib/fp32/matmul.h"
namespace
mindspore
{
using
mindspore
::
lite
::
tensor
::
Tensor
;
class
TestFcFp32
:
public
mindspore
::
Common
{
public:
TestFcFp32
()
{}
};
int
FcTestInit1
(
std
::
vector
<
lite
::
tensor
::
Tensor
*>
*
inputs_
,
std
::
vector
<
lite
::
tensor
::
Tensor
*>
*
outputs_
,
MatMulParameter
*
matmal_param
,
float
**
correct
)
{
Tensor
*
in_t
=
new
Tensor
(
kNumberTypeFloat
,
{
2
,
2
,
2
,
2
},
schema
::
Format_NHWC
,
static_cast
<
schema
::
NodeType
>
(
1
));
in_t
->
MallocData
();
float
in
[]
=
{
-
3.2366564
,
-
4.7733846
,
-
7.8329225
,
16.146885
,
5.060793
,
-
6.1471
,
-
1.7680453
,
-
6.5721383
,
17.87506
,
-
5.1192183
,
10.742863
,
1.4536934
,
19.693445
,
19.45783
,
5.063163
,
0.5234792
};
memcpy
(
in_t
->
Data
(),
in
,
sizeof
(
float
)
*
in_t
->
ElementsNum
());
inputs_
->
push_back
(
in_t
);
Tensor
*
weight_t
=
new
Tensor
(
kNumberTypeFloat
,
{
3
,
8
},
schema
::
Format_NHWC
,
static_cast
<
schema
::
NodeType
>
(
1
));
weight_t
->
MallocData
();
float
weight
[]
=
{
-
0.0024438887
,
0.0006738146
,
-
0.008169129
,
0.0021510671
,
-
0.012470592
,
-
0.0053063435
,
0.006050155
,
0.008656233
,
0.012911413
,
-
0.0028635843
,
-
0.00034080597
,
-
0.0010622552
,
-
0.012254699
,
-
0.01312836
,
0.0025241964
,
-
0.004706142
,
0.002451482
,
-
0.009558459
,
0.004481974
,
0.0033251503
,
-
0.011705584
,
-
0.001720293
,
-
0.0039410214
,
-
0.0073637343
};
memcpy
(
weight_t
->
Data
(),
weight
,
sizeof
(
float
)
*
weight_t
->
ElementsNum
());
inputs_
->
push_back
(
weight_t
);
Tensor
*
bias_t
=
new
Tensor
(
kNumberTypeFloat
,
{
3
},
schema
::
Format_NHWC
,
static_cast
<
schema
::
NodeType
>
(
1
));
bias_t
->
MallocData
();
float
bias
[]
=
{
1.6103756
,
-
0.9872417
,
0.546849
};
memcpy
(
bias_t
->
Data
(),
bias
,
sizeof
(
float
)
*
bias_t
->
ElementsNum
());
inputs_
->
push_back
(
bias_t
);
Tensor
*
out_t
=
new
Tensor
(
kNumberTypeFloat
,
{
2
,
3
},
schema
::
Format_NHWC
,
static_cast
<
schema
::
NodeType
>
(
1
));
out_t
->
MallocData
();
outputs_
->
push_back
(
out_t
);
*
correct
=
reinterpret_cast
<
float
*>
(
malloc
(
out_t
->
ElementsNum
()
*
sizeof
(
float
)));
float
nchw_co
[]
=
{
1.6157111
,
-
0.98469573
,
0.6098231
,
1.1649342
,
-
1.2334653
,
0.404779
};
memcpy
(
*
correct
,
nchw_co
,
out_t
->
ElementsNum
()
*
sizeof
(
float
));
matmal_param
->
b_transpose_
=
true
;
matmal_param
->
a_transpose_
=
false
;
matmal_param
->
has_bias_
=
true
;
matmal_param
->
act_type_
=
ActType_No
;
return
out_t
->
ElementsNum
();
}
TEST_F
(
TestFcFp32
,
FcTest1
)
{
std
::
vector
<
lite
::
tensor
::
Tensor
*>
inputs_
;
std
::
vector
<
lite
::
tensor
::
Tensor
*>
outputs_
;
auto
matmul_param
=
new
MatMulParameter
();
float
*
correct
;
int
total_size
=
FcTestInit1
(
&
inputs_
,
&
outputs_
,
matmul_param
,
&
correct
);
lite
::
Context
*
ctx
=
new
lite
::
Context
;
ctx
->
threadNum
=
2
;
kernel
::
FullconnectionCPUKernel
*
fc
=
new
kernel
::
FullconnectionCPUKernel
(
reinterpret_cast
<
OpParameter
*>
(
matmul_param
),
inputs_
,
outputs_
,
ctx
);
fc
->
Init
();
fc
->
Run
();
CompareOutputData
(
reinterpret_cast
<
float
*>
(
outputs_
[
0
]
->
Data
()),
correct
,
total_size
,
0.0001
);
}
int
FcTestInit2
(
std
::
vector
<
lite
::
tensor
::
Tensor
*>
*
inputs_
,
std
::
vector
<
lite
::
tensor
::
Tensor
*>
*
outputs_
,
MatMulParameter
*
matmal_param
,
float
**
correct
)
{
size_t
buffer_size
;
Tensor
*
in_t
=
new
Tensor
(
kNumberTypeFloat
,
{
20
,
4
,
2
,
10
},
schema
::
Format_NCHW
,
static_cast
<
schema
::
NodeType
>
(
1
));
in_t
->
MallocData
();
std
::
string
in_path
=
"./matmul/FcFp32_input1.bin"
;
auto
in_data
=
mindspore
::
lite
::
ReadFile
(
in_path
.
c_str
(),
&
buffer_size
);
memcpy
(
in_t
->
Data
(),
in_data
,
buffer_size
);
inputs_
->
push_back
(
in_t
);
Tensor
*
weight_t
=
new
Tensor
(
kNumberTypeFloat
,
{
30
,
80
},
schema
::
Format_NCHW
,
static_cast
<
schema
::
NodeType
>
(
1
));
weight_t
->
MallocData
();
std
::
string
weight_path
=
"./matmul/FcFp32_weight1.bin"
;
auto
w_data
=
mindspore
::
lite
::
ReadFile
(
weight_path
.
c_str
(),
&
buffer_size
);
memcpy
(
weight_t
->
Data
(),
w_data
,
buffer_size
);
inputs_
->
push_back
(
weight_t
);
Tensor
*
bias_t
=
new
Tensor
(
kNumberTypeFloat
,
{
30
},
schema
::
Format_NCHW
,
static_cast
<
schema
::
NodeType
>
(
1
));
bias_t
->
MallocData
();
std
::
string
bias_path
=
"./matmul/FcFp32_bias1.bin"
;
auto
bias_data
=
mindspore
::
lite
::
ReadFile
(
bias_path
.
c_str
(),
&
buffer_size
);
memcpy
(
bias_t
->
Data
(),
bias_data
,
buffer_size
);
inputs_
->
push_back
(
bias_t
);
Tensor
*
out_t
=
new
Tensor
(
kNumberTypeFloat
,
{
20
,
30
},
schema
::
Format_NCHW
,
static_cast
<
schema
::
NodeType
>
(
1
));
out_t
->
MallocData
();
outputs_
->
push_back
(
out_t
);
*
correct
=
reinterpret_cast
<
float
*>
(
malloc
(
out_t
->
ElementsNum
()
*
sizeof
(
float
)));
std
::
string
out_path
=
"./matmul/FcFp32_output1.bin"
;
auto
out_data
=
mindspore
::
lite
::
ReadFile
(
out_path
.
c_str
(),
&
buffer_size
);
memcpy
(
*
correct
,
out_data
,
out_t
->
ElementsNum
()
*
sizeof
(
float
));
matmal_param
->
b_transpose_
=
true
;
matmal_param
->
a_transpose_
=
false
;
matmal_param
->
has_bias_
=
true
;
matmal_param
->
act_type_
=
ActType_No
;
return
out_t
->
ElementsNum
();
}
TEST_F
(
TestFcFp32
,
FcTest2
)
{
std
::
vector
<
lite
::
tensor
::
Tensor
*>
inputs_
;
std
::
vector
<
lite
::
tensor
::
Tensor
*>
outputs_
;
auto
matmul_param
=
new
MatMulParameter
();
float
*
correct
;
int
total_size
=
FcTestInit2
(
&
inputs_
,
&
outputs_
,
matmul_param
,
&
correct
);
lite
::
Context
*
ctx
=
new
lite
::
Context
;
ctx
->
threadNum
=
1
;
kernel
::
FullconnectionCPUKernel
*
fc
=
new
kernel
::
FullconnectionCPUKernel
(
reinterpret_cast
<
OpParameter
*>
(
matmul_param
),
inputs_
,
outputs_
,
ctx
);
fc
->
Init
();
fc
->
Run
();
CompareOutputData
(
reinterpret_cast
<
float
*>
(
outputs_
[
0
]
->
Data
()),
correct
,
total_size
,
0.0001
);
}
}
// namespace mindspore
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/strassen_fp32_tests.cc
0 → 100644
浏览文件 @
8beb1b0f
此差异已折叠。
点击以展开。
mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc
0 → 100644
浏览文件 @
8beb1b0f
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include <cmath>
#include <iostream>
#include <memory>
#include "common/common_test.h"
#include "src/common/file_utils.h"
#include "mindspore/lite/src/kernel_registry.h"
#include "mindspore/lite/src/runtime/kernel/arm/opclib/pack.h"
#include "mindspore/lite/src/runtime/kernel/arm/opclib/fp32/matmul.h"
#include "mindspore/lite/src/runtime/kernel/arm/opclib/int8/deconv.h"
#include "mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.h"
using
mindspore
::
lite
::
DeviceType
;
namespace
mindspore
{
using
mindspore
::
lite
::
tensor
::
QuantArg
;
using
mindspore
::
lite
::
tensor
::
Tensor
;
using
mindspore
::
schema
::
Format_NHWC
;
using
mindspore
::
schema
::
NodeType_Parameter
;
// Test fixture for the int8 deconvolution kernel and its helper routines
// (packing, int8 matmul, post-processing). Inherits the shared comparison
// utilities (CompareOutputData, ...) from mindspore::Common.
class TestDeconvInt8 : public mindspore::Common {
 public:
  TestDeconvInt8() {}
};
/**
 * Quantizes a float buffer to int8: vq = round(vi / scale + zp), saturated
 * to [INT8_MIN, INT8_MAX].
 *
 * @param fptr   source float values
 * @param iptr   destination int8 buffer, at least `size` elements
 * @param size   number of elements to convert
 * @param zp     quantization zero point
 * @param scale  quantization scale (must be non-zero)
 */
void FloatToInt8(float *fptr, int8_t *iptr, size_t size, int32_t zp, double scale) {
  // size_t index: the original `int i < size` mixed signed and unsigned.
  for (size_t i = 0; i < size; i++) {
    int32_t value = static_cast<int32_t>(std::lround(fptr[i] / scale + zp));
    // Saturate instead of letting the narrowing cast wrap around.
    value = std::min(value, static_cast<int32_t>(INT8_MAX));
    value = std::max(value, static_cast<int32_t>(INT8_MIN));
    iptr[i] = static_cast<int8_t>(value);
  }
}
// Verifies PackNHWCToC8HWN8Int8 on a 5x1x2x6 weight tensor: channels are
// grouped into blocks of 8 (6 real channels + 2 zero pads) and the blocks are
// laid out spatial-position-major, batch-minor, so every run of 8 in the
// expected output is one (hw, n) pair's channel block.
TEST_F(TestDeconvInt8, PackWeight1) {
  // source weights, NHWC with N=5, H=1, W=2, C=6 (60 values)
  int8_t in[] = {-8,   11,  99,  -80, 8,    -12,  37,  -45, 31,  -69, -66, 26,
                 112,  124, -109, 85, -24,  28,   -46, 100, 72,  -36, -82, 64,
                 -110, 37,  -72, 65,  -124, 91,   -43, 99,  3,   100, 19,  51,
                 -14,  -81, 67,  90,  4,    -106, 105, 28,  -61, -79, 55,  -54,
                 47,   -38, 114, 125, -65,  100,  6,   -72, -33, 60,  109, -68};
  // expected C8HWN8 layout (80 values: 1 channel block x 2 spatial x 5 batch x 8),
  // each 6-channel group zero-padded to 8
  int8_t co[] = {-8,   11,  99,  -80, 8,    -12, 0, 0, 112, 124, -109, 85,  -24, 28,   0, 0,
                 -110, 37,  -72, 65,  -124, 91,  0, 0, -14, -81, 67,   90,  4,   -106, 0, 0,
                 47,   -38, 114, 125, -65,  100, 0, 0, 37,  -45, 31,   -69, -66, 26,   0, 0,
                 -46,  100, 72,  -36, -82,  64,  0, 0, -43, 99,  3,    100, 19,  51,   0, 0,
                 105,  28,  -61, -79, 55,   -54, 0, 0, 6,   -72, -33,  60,  109, -68,  0, 0};
  int8_t dst[80] = {0};
  /*5*1*2*6 nhwc*/
  PackNHWCToC8HWN8Int8(in, dst, 5, 2, 6);
  // element-wise comparison, tolerance 1
  CompareOutputData(dst, co, 80, 1);
}
// Verifies PackNHWCToC8HWN8Int8 on a 22x1x1x20 tensor: 20 channels split into
// three c8 blocks (ch 0-7, ch 8-15, ch 16-19 + 4 zero pads); within each block
// the 22 batches appear in order, 8 channel values each, for 3*22*8 = 528
// output values.
TEST_F(TestDeconvInt8, PackWeight2) {
  // source, NHWC with N=22, H=W=1, C=20 (440 values; one batch per line)
  int8_t in[] = {
    40,   24,   94,   122,  67,   34,   -89,  31,   -43,  121,  48,   -54,  44,   -91,  35,   89,   -37,  114,  -8,   103,
    -22,  32,   26,   112,  -92,  -23,  43,   9,    81,   118,  -73,  -54,  65,   -99,  51,   -90,  121,  -62,  119,  -93,
    21,   -92,  -1,   -82,  -71,  -54,  63,   -93,  92,   -93,  99,   122,  -104, -16,  -8,   -32,  90,   -126, 51,   91,
    4,    70,   -7,   116,  99,   81,   -79,  124,  -14,  28,   97,   9,    -97,  99,   88,   -15,  54,   26,   77,   -25,
    113,  119,  119,  -75,  -17,  7,    7,    1,    69,   66,   40,   -13,  80,   -115, -98,  -8,   -17,  31,   88,   65,
    -1,   -15,  -98,  77,   56,   119,  -20,  -32,  -54,  -58,  -16,  52,   121,  126,  -33,  43,   92,   -34,  -17,  -52,
    104,  -52,  -91,  76,   79,   105,  102,  -65,  43,   32,   13,   15,   -38,  95,   -18,  -82,  -7,   118,  -79,  -85,
    120,  -15,  2,    32,   -94,  111,  115,  102,  -18,  121,  -106, 54,   63,   111,  -16,  92,   82,   -23,  111,  53,
    1,    -48,  45,   19,   -4,   -15,  -72,  41,   80,   -51,  116,  31,   94,   101,  -10,  18,   0,    -49,  108,  28,
    -36,  47,   -14,  -2,   -10,  31,   -92,  -84,  74,   -114, -107, 66,   99,   -121, -107, 31,   -38,  56,   -30,  109,
    -7,   28,   -22,  -17,  -3,   -2,   27,   -3,   108,  -84,  -23,  -71,  -54,  20,   -45,  109,  -42,  78,   -79,  98,
    -10,  57,   52,   1,    25,   73,   21,   -78,  46,   121,  66,   92,   24,   55,   4,    -110, -37,  112,  -18,  10,
    -42,  16,   -9,   31,   39,   -70,  108,  -3,   -90,  -60,  -121, 11,   50,   -88,  -104, -29,  -89,  94,   64,   -91,
    -101, -7,   23,   -57,  93,   16,   17,   35,   -48,  -25,  13,   -121, 73,   -68,  -54,  -122, -20,  12,   64,   20,
    -11,  -6,   -71,  -52,  -97,  109,  116,  -107, 117,  -124, 56,   80,   -108, 30,   123,  56,   -80,  39,   -18,  -97,
    -103, 122,  114,  -10,  -31,  97,   -92,  105,  -61,  -25,  10,   -119, -106, 41,   77,   -117, 55,   -83,  -29,  14,
    27,   -106, -86,  41,   43,   23,   11,   -76,  -34,  121,  94,   18,   69,   73,   100,  54,   43,   32,   13,   15,
    -38,  95,   -18,  -82,  -7,   118,  -79,  -85,  120,  -15,  2,    32,   -94,  111,  115,  102,  -18,  121,  -106, 54,
    63,   111,  -16,  92,   82,   -23,  111,  53,   1,    -48,  45,   19,   -4,   -15,  -72,  41,   80,   -51,  116,  31,
    94,   101,  -10,  18,   0,    -49,  108,  28,   -36,  47,   -14,  -2,   -10,  31,   -92,  -84,  74,   -114, -107, 66,
    99,   -121, -107, 31,   -38,  56,   -30,  109,  -7,   28,   -22,  -17,  -3,   -2,   27,   -3,   108,  -84,  -23,  -71,
    -54,  20,   -45,  109,  -42,  78,   -79,  98,   -10,  57,   52,   1,    25,   73,   21,   -78,  46,   121,  66,   92};
  // expected C8HWN8 output: block 0 = channels 0-7 of batches 0..21,
  // block 1 = channels 8-15, block 2 = channels 16-19 followed by 4 zero pads
  int8_t co[] = {
    40,   24,   94,   122,  67,   34,   -89,  31,
    -22,  32,   26,   112,  -92,  -23,  43,   9,
    21,   -92,  -1,   -82,  -71,  -54,  63,   -93,
    4,    70,   -7,   116,  99,   81,   -79,  124,
    113,  119,  119,  -75,  -17,  7,    7,    1,
    -1,   -15,  -98,  77,   56,   119,  -20,  -32,
    104,  -52,  -91,  76,   79,   105,  102,  -65,
    120,  -15,  2,    32,   -94,  111,  115,  102,
    1,    -48,  45,   19,   -4,   -15,  -72,  41,
    -36,  47,   -14,  -2,   -10,  31,   -92,  -84,
    -7,   28,   -22,  -17,  -3,   -2,   27,   -3,
    -10,  57,   52,   1,    25,   73,   21,   -78,
    -42,  16,   -9,   31,   39,   -70,  108,  -3,
    -101, -7,   23,   -57,  93,   16,   17,   35,
    -11,  -6,   -71,  -52,  -97,  109,  116,  -107,
    -103, 122,  114,  -10,  -31,  97,   -92,  105,
    27,   -106, -86,  41,   43,   23,   11,   -76,
    -38,  95,   -18,  -82,  -7,   118,  -79,  -85,
    63,   111,  -16,  92,   82,   -23,  111,  53,
    94,   101,  -10,  18,   0,    -49,  108,  28,
    99,   -121, -107, 31,   -38,  56,   -30,  109,
    -54,  20,   -45,  109,  -42,  78,   -79,  98,
    -43,  121,  48,   -54,  44,   -91,  35,   89,
    81,   118,  -73,  -54,  65,   -99,  51,   -90,
    92,   -93,  99,   122,  -104, -16,  -8,   -32,
    -14,  28,   97,   9,    -97,  99,   88,   -15,
    69,   66,   40,   -13,  80,   -115, -98,  -8,
    -54,  -58,  -16,  52,   121,  126,  -33,  43,
    43,   32,   13,   15,   -38,  95,   -18,  -82,
    -18,  121,  -106, 54,   63,   111,  -16,  92,
    80,   -51,  116,  31,   94,   101,  -10,  18,
    74,   -114, -107, 66,   99,   -121, -107, 31,
    108,  -84,  -23,  -71,  -54,  20,   -45,  109,
    46,   121,  66,   92,   24,   55,   4,    -110,
    -90,  -60,  -121, 11,   50,   -88,  -104, -29,
    -48,  -25,  13,   -121, 73,   -68,  -54,  -122,
    117,  -124, 56,   80,   -108, 30,   123,  56,
    -61,  -25,  10,   -119, -106, 41,   77,   -117,
    -34,  121,  94,   18,   69,   73,   100,  54,
    120,  -15,  2,    32,   -94,  111,  115,  102,
    1,    -48,  45,   19,   -4,   -15,  -72,  41,
    -36,  47,   -14,  -2,   -10,  31,   -92,  -84,
    -7,   28,   -22,  -17,  -3,   -2,   27,   -3,
    -10,  57,   52,   1,    25,   73,   21,   -78,
    -37,  114,  -8,   103,  0,    0,    0,    0,
    121,  -62,  119,  -93,  0,    0,    0,    0,
    90,   -126, 51,   91,   0,    0,    0,    0,
    54,   26,   77,   -25,  0,    0,    0,    0,
    -17,  31,   88,   65,   0,    0,    0,    0,
    92,   -34,  -17,  -52,  0,    0,    0,    0,
    -7,   118,  -79,  -85,  0,    0,    0,    0,
    82,   -23,  111,  53,   0,    0,    0,    0,
    0,    -49,  108,  28,   0,    0,    0,    0,
    -38,  56,   -30,  109,  0,    0,    0,    0,
    -42,  78,   -79,  98,   0,    0,    0,    0,
    -37,  112,  -18,  10,   0,    0,    0,    0,
    -89,  94,   64,   -91,  0,    0,    0,    0,
    -20,  12,   64,   20,   0,    0,    0,    0,
    -80,  39,   -18,  -97,  0,    0,    0,    0,
    55,   -83,  -29,  14,   0,    0,    0,    0,
    43,   32,   13,   15,   0,    0,    0,    0,
    -18,  121,  -106, 54,   0,    0,    0,    0,
    80,   -51,  116,  31,   0,    0,    0,    0,
    74,   -114, -107, 66,   0,    0,    0,    0,
    108,  -84,  -23,  -71,  0,    0,    0,    0,
    46,   121,  66,   92,   0,    0,    0,    0};
  int8_t dst[528] = {0};
  PackNHWCToC8HWN8Int8(in, dst, 22, 1, 20);
  // element-wise comparison, tolerance 1
  CompareOutputData(dst, co, 528, 1);
}
// Verifies the int8 matrix multiply used by the int8 deconvolution:
// A (10x12, row major) is packed into col8-major tiles (rows padded 10 -> 16),
// B (12x18, stored column major, i.e. already transposed) is packed the same
// way (cols padded 18 -> 24), MatMulInt8 produces the zero-point-corrected
// int32 product in an 8x8-tiled buffer, which is then unpacked back to plain
// row major and compared against the precomputed reference product.
TEST_F(TestDeconvInt8, MatMulTest1) {
  int8_t a_row_major_10_12[] = {
    -6,  76,  32,  80,  -73, 8,    -85, -3,  114, 80,  30, 42,
    -41, 117, 62,  -76, -77, -111, 88,  105, 68,  105, -74, 13,
    51,  94,  31,  -52, -92, -4,   -35, -71, 101, -93, 46, -65,
    57,  -41, -51, 77,  1,   9,    73,  -19, -36, 57,  81, -24,
    40,  103, 112, 109, -41, -68,  57,  61,  55,  -20, 3,  2,
    17,  -16, -31, 58,  -4,  67,   -4,  -95, -5,  -72, 81, 15,
    -7,  -16, -47, 112, 114, -26,  -98, 53,  15,  -49, 26, 19,
    19,  8,   -57, -35, -79, 118,  29,  21,  37,  -48, 83, 7,
    124, 113, -5,  15,  -8,  107,  -65, -88, 50,  -47, -80, -84,
    3,   -45, 92,  42,  -20, -101, 106, -10, 89,  67,  55, 10};
  int32_t zp_a = 15;                   // zero point of A
  int8_t a_col8_major[16 * 12] = {0};  // 10 rows padded up to 16 (multiple of 8)
  // B, 12x18 column major (18 columns of 12 values each; one column per line)
  int8_t b_col_major_12_18[] = {
    92,  27,   22,   52,  -112, -20, -57, -2,  89,   32,  93,  -66,
    -25, -54,  94,   -97, -119, -98, 101, -99, 77,   -83, 76,  95,
    59,  97,   8,    40,  -109, -20, 67,  -107, 37,  -6,  -54, -20,
    -30, 36,   -106, -103, -3,  -86, -82, 59,  4,    -75, -50, -106,
    55,  104,  -117, -71, -20,  -85, -77, 16,  -25,  -58, 4,   80,
    -75, 94,   32,   -68, 2,    40,  56,  -103, 11,  -98, -70, -69,
    0,   57,   -6,   82,  66,   -112, -61, 33,  -77, -53, 95,  -38,
    87,  -46,  -3,   81,  -47,  43,  21,  26,  -45,  -57, 50,  -24,
    -82, -114, 61,   46,  -53,  78,  -24, 31,  -7,   37,  29,  38,
    45,  106,  52,   -42, 31,   -6,  -61, -87, 2,    79,  -5,  -42,
    43,  -106, -104, 7,   91,   -63, 58,  97,  -15,  74,  -96, 15,
    -23, -3,   -47,  -97, 100,  -54, 26,  -46, 35,   26,  100, -80,
    34,  -25,  96,   -67, -80,  -27, 66,  41,  41,   -43, -43, -38,
    -4,  -64,  31,   7,   -8,   6,   -2,  39,  -119, 53,  75,  -91,
    -44, 77,   -62,  22,  -44,  78,  -67, -48, -115, -4,  43,  81,
    40,  -20,  -5,   -89, 60,   -62, -4,  -48, 66,   -64, -69, 62,
    17,  -89,  1,    87,  81,   32,  -29, 51,  40,   27,  66,  67,
    11,  -69,  85,   -79, -106, 55,  22,  -23, 62,   69,  -74, 49};
  int32_t zp_b = -20;                 // zero point of B
  int8_t b_row8_major[12 * 24] = {0};  // 18 columns padded up to 24
  // expected 10x18 int32 product (one output row per line)
  int32_t co_row_major_10_18[] = {
    32005,  3597,   16595,  -3458,  6627,   -6663,  818,    -3910,  10228,
    15079,  -19205, -10203, -3178,  -10046, 10374,  -6199,  5330,   12163,
    1819,   20533,  17382,  18283,  9778,   9185,   -12623, -26234, -11987,
    7904,   8144,   -1603,  27611,  -10190, -20053, 4999,   -28389, 21852,
    24680,  25858,  23506,  17944,  11768,  24378,  -6102,  -4675,  -23460,
    10434,  -47579, 1986,   12018,  -19418, -7248,  4938,   -32613, -941,
    8171,   -4788,  3325,   -11310, -8351,  -14786, 6909,   16401,  2017,
    -6456,  11242,  7393,   -9119,  17312,  2646,   -14402, 7201,   -9949,
    23986,  17607,  27461,  -1547,  2783,   7558,   19487,  11158,  -2686,
    6328,   -8225,  -11668, 21858,  -2079,  -8671,  -639,   -1544,  1235,
    1156,   6582,   2829,   -10311, -2692,  5154,   1527,   10870,  106,
    -8189,  -24174, -1846,  -15399, -3598,  14874,  -5591,  -619,   -13667,
    -6053,  -31103, -24499, 13008,  9143,   -17982, 28437,  2176,   -2114,
    -11631, 10779,  -1032,  -24690, -3112,  2125,   432,    20270,  -33859,
    8907,   10063,  1603,   3761,   4805,   4904,   -15594, 10786,  4287,
    -13591, -18777, -1679,  2109,   -2243,  12051,  -8504,  -6558,  4209,
    13606,  -25803, 27922,  12092,  7140,   27142,  -12267, 2339,   -26224,
    23674,  -26579, -11398, -1823,  -18976, 3641,   4415,   -24878, -2045,
    15937,  41465,  12601,  -14513, -17619, -5728,  334,    -424,   8147,
    -1369,  5984,   11000,  19016,  4456,   -25920, 4506,   5930,   15458};
  int32_t c_row8x8_major[16 * 24] = {0};
  int32_t out_row_major[180] = {0};
  RowMajor2Col8MajorInt8(a_row_major_10_12, a_col8_major, 10, 12);
  RowMajor2Col8MajorInt8(b_col_major_12_18, b_row8_major, 18, 12);
  MatMulInt8(a_col8_major, b_row8_major, c_row8x8_major, 16, 24, 12, zp_a, zp_b);
  // NOTE(review): the tiled int32 result is funneled through the float unpack
  // routine via reinterpret_cast. This is only safe if Row8x8Major2RowMajor
  // merely copies 4-byte words (bit patterns preserved) and never does
  // arithmetic on the values -- confirm against its implementation.
  Row8x8Major2RowMajor(reinterpret_cast<float *>(c_row8x8_major), reinterpret_cast<float *>(out_row_major), 10, 18);
  // element-wise comparison, tolerance 1
  CompareOutputData(out_row_major, co_row_major_10_18, 180, 1);
}
// Verifies PostFuncInt8, the deconv int8 epilogue: per-channel bias add,
// requantization (the double multiplier is decomposed into a quantized
// multiplier plus left/right shifts), output zero-point add, and clamping.
// The same int32 input is checked with three clamp windows:
// [-128,127] (no activation), [0,127] (relu), and [0,6] (relu6).
TEST_F(TestDeconvInt8, PostAddTest1) {
  // raw int32 accumulator data: 16 rows (10 channels padded to 2 c8 blocks)
  // of 8 columns (5 pixels padded to 8)
  int32_t in[] = {
    -4956,  -3923, 868,   -8880, -4089, -5179, -4526,  -4527,
    -10464, 99,    -5826, -2995, -4519, -4519, -10509, -2505,
    -11272, 434,   -4522, -4523, -5287, -8936, -878,   373,
    -4528,  -4529, -1960, -6589, 1688,  2287,  -8059,  926,
    -2506,  -6972, -2834, -8281, -8118, -3110, -4526,  -4527,
    -4528,  -4529, -4519, -4519, -4519, -4519, -4519,  -4519,
    -4520,  -4521, -4522, -4523, -4524, -4525, -4526,  -4527,
    -4528,  -4529, -4519, -4519, -4519, -4519, -4519,  -4519,
    1578,   2231,  -4522, -4523, -4524, -4525, -4526,  -4527,
    -8449,  -990,  -4519, -4519, -4519, -4519, -4519,  -4519,
    -4303,  -10293, -4522, -4523, -4524, -4525, -4526, -4527,
    -4528,  -4529, -4519, -4519, -4519, -4519, -4519,  -4519,
    -7025,  924,   -4522, -4523, -4524, -4525, -4526,  -4527,
    -4528,  -4529, -4519, -4519, -4519, -4519, -4519,  -4519,
    -4520,  -4521, -4522, -4523, -4524, -4525, -4526,  -4527,
    -4528,  -4529, -4519, -4519, -4519, -4519, -4519,  -4519};
  // expected int8 output for the unclamped ([-128,127]) case
  int8_t co[] = {-8,   11,  99,  -80, 8,    -12, 0, 0, 112, 124, -109, 85,  -24, 28,   0, 0,
                 -110, 37,  -72, 65,  -124, 91,  0, 0, -14, -81, 67,   90,  4,   -106, 0, 0,
                 47,   -38, 114, 125, -65,  100, 0, 0, 37,  -45, 31,   -69, -66, 26,   0, 0,
                 -46,  100};
  // per-channel bias for the 10 output channels
  int32_t bias[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  int8_t out[50] = {0};
  double multiplier = 0.0183649725490196;
  int32_t quant_multiplier;
  int32_t left_shift;
  int32_t right_shift;
  // decompose the real multiplier into fixed-point multiplier + shifts
  QuantizeRoundParameter(multiplier, &quant_multiplier, &left_shift, &right_shift);
  int32_t zp = 83;  // output zero point
  PostFuncInt8(in, bias, out, 10, 5, 8, quant_multiplier, left_shift, right_shift, zp, -128, 127);
  CompareOutputData(out, co, 50, 1);

  // same computation clamped to [0,127] (relu)
  int8_t co_relu[] = {0,  11, 99,  0,   8, 0,   0, 0, 112, 124, 0,  85, 0,  28, 0, 0,
                      0,  37, 0,   65,  0, 91,  0, 0, 0,   0,   67, 90, 4,  0,  0, 0,
                      47, 0,  114, 125, 0, 100, 0, 0, 37,  0,   31, 0,  0,  26, 0, 0,
                      0,  100};
  PostFuncInt8(in, bias, out, 10, 5, 8, quant_multiplier, left_shift, right_shift, zp, 0, 127);
  CompareOutputData(out, co_relu, 50, 1);

  // same computation clamped to [0,6] (relu6)
  int8_t co_relu6[] = {0, 6, 6, 0, 6, 0, 0, 0, 6, 6, 0, 6, 0, 6, 0, 0,
                       0, 6, 0, 6, 0, 6, 0, 0, 0, 0, 6, 6, 4, 0, 0, 0,
                       6, 0, 6, 6, 0, 6, 0, 0, 6, 0, 6, 0, 0, 6, 0, 0,
                       0, 6};
  PostFuncInt8(in, bias, out, 10, 5, 8, quant_multiplier, left_shift, right_shift, zp, 0, 6);
  CompareOutputData(out, co_relu6, 50, 1);
}
// Builds the int8 deconvolution fixture: a quantized 1x4x2x3 input, a 3x3
// weight with 3 input / 2 output channels, a quantized 1x7x3x2 output tensor,
// and stride-2 conv parameters. *correct receives a malloc'd NHWC reference
// converted from the NCHW values below. Returns the output element count.
int DeConvInt8TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
                        ConvParameter *conv_param, int8_t **correct) {
  /* float data from deconv fp32 testcase : DeConvTestInit2 */
  /* vq = (vi - zp) * s vi = vq / s + zp */
  Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 4, 2, 3}, Format_NHWC, NodeType_Parameter);
  in_t->MallocData();
  int8_t in[] = {6,   43, 38, 24,  -8, 12, 41, -24, -20, 41,  -19, -6,
                 -26, -6, 23, -31, 34, 45, 8,  45,  -39, -27, -48, 12};
  memcpy(in_t->Data(), in, sizeof(int8_t) * in_t->ElementsNum());
  // input quantization parameters
  QuantArg *in_quant_arg = new QuantArg();
  in_quant_arg->zeroPoint = -19, in_quant_arg->scale = 0.31228156;
  in_t->AddQuantParam(*in_quant_arg);
  inputs_->push_back(in_t);

  // weight: 3x3x3x2 (kh, kw, cin, cout), quantized
  Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 3, 3, 2}, Format_NHWC, NodeType_Parameter);
  weight_t->MallocData();
  int8_t weight[] = {66,  89, 98,  74,  95, 86, 125, 95, 105, 83, 116, 94, 90, 80, 86, 59, 72, 92,
                     64,  76, 92,  80,  90, 87, 106, 55, 105, 60, 75,  53, 81, 81, 98, 81, 86, 59,
                     74,  82, 97,  105, 71, 67, 79,  87, 72,  79, 80,  76, 96, 80, 83, 71, 61, 79};
  memcpy(weight_t->Data(), weight, sizeof(int8_t) * weight_t->ElementsNum());
  QuantArg *w_quant_arg = new QuantArg();
  w_quant_arg->zeroPoint = 83, w_quant_arg->scale = 0.023649725490196;
  weight_t->AddQuantParam(*w_quant_arg);
  inputs_->push_back(weight_t);

  // output tensor: 1x7x3x2 NHWC, quantized
  Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 7, 3, 2}, Format_NHWC, NodeType_Parameter);
  out_t->MallocData();
  QuantArg *out_quant_arg = new QuantArg();
  out_quant_arg->zeroPoint = 31, out_quant_arg->scale = 0.3439215686275;
  out_t->AddQuantParam(*out_quant_arg);
  outputs_->push_back(out_t);

  // reference values are authored in NCHW; repack to NHWC into *correct
  *correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t)));
  int8_t co_nchw[] = {57, 76, 49,  71, 8,  61, 57, 127, 56, 46,  -11, 61, 23, 31,
                      34, 50, 59,  49, 78, 17, 6,  -3,  -5, 23,  -11, 6,  -5, 33,
                      64, 30, 21,  18, 25, 21, -15, 0,  4,  31,  36,  2,  17, 43};
  PackNCHWToNHWCInt8(co_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel());

  // 3x3 kernel, pad 1, stride 2, dilation 1
  conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
  conv_param->pad_h_ = conv_param->pad_w_ = 1;
  conv_param->stride_h_ = conv_param->stride_w_ = 2;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
  return out_t->ElementsNum();
}
// End-to-end int8 deconvolution: builds the DeConvInt8TestInit1 fixture,
// runs the kernel on two threads, and compares against the reference
// within a tolerance of 3 (quantization error allowance).
TEST_F(TestDeconvInt8, DeConvInt8Test1) {
  std::vector<lite::tensor::Tensor *> inputs_;
  std::vector<lite::tensor::Tensor *> outputs_;
  auto deconv_param = new ConvParameter();
  lite::Context *ctx = new lite::Context;
  ctx->threadNum = 2;
  int8_t *correct;
  int total_size = DeConvInt8TestInit1(&inputs_, &outputs_, deconv_param, &correct);
  mindspore::kernel::DeConvInt8CPUKernel *deconv = new mindspore::kernel::DeConvInt8CPUKernel(
    reinterpret_cast<OpParameter *>(deconv_param), inputs_, outputs_, ctx);
  deconv->Init();
  deconv->Run();
  CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 3);
  delete deconv_param;
  // delete deconv;
  // NOTE(review): the kernel object is deliberately leaked above -- presumably
  // deleting it after deconv_param would double-free shared state. Confirm
  // DeConvInt8CPUKernel's ownership of its OpParameter before enabling.
  for (auto t : inputs_) delete t;
  for (auto t : outputs_) delete t;
  free(correct);
}
}
// namespace mindspore
mindspore/lite/test/ut/src/runtime/kernel/arm/int8/fullconnection_int8_tests.cc
浏览文件 @
8beb1b0f
...
...
@@ -27,7 +27,7 @@ namespace mindspore {
using
lite
::
tensor
::
Tensor
;
class
TestFcInt8
:
public
mindspore
::
Common
{
public:
TestFcInt8
(){}
TestFcInt8
()
{}
};
void
Quantize
(
float
*
input_data
,
int
length
,
float
scale
,
int
zero_point
,
int8_t
*
output_data
)
{
...
...
@@ -110,8 +110,7 @@ int FcInt8TestInit(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lit
matmal_param
->
b_transpose_
=
true
;
matmal_param
->
a_transpose_
=
false
;
matmal_param
->
has_bias_
=
true
;
matmal_param
->
minf_
=
-
FLT_MAX
;
matmal_param
->
maxf_
=
FLT_MAX
;
matmal_param
->
act_type_
=
ActType_No
;
return
out_t
->
ElementsNum
();
}
...
...
mindspore/lite/test/ut/src/runtime/kernel/arm/int8/pad_int8_tests.cc
0 → 100644
浏览文件 @
8beb1b0f
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include "include/context.h"
#include "src/ir/tensor.h"
#include "common/common_test.h"
#include "src/common/file_utils.h"
#include "src/runtime/kernel/arm/opclib/pad_parameter.h"
#include "src/runtime/kernel/arm/int8/pad_int8.h"
namespace
mindspore
{
using
mindspore
::
lite
::
tensor
::
QuantArg
;
using
mindspore
::
lite
::
tensor
::
Tensor
;
// Test fixture for the int8 pad kernel; inherits the shared comparison
// utilities (CompareOutputData, ...) from mindspore::Common.
class TestPadInt8 : public mindspore::Common {
 public:
  TestPadInt8() {}
};
// Builds a 1-D pad fixture: input [3] of ones, two padded positions on each
// side of the last axis, expected output [7] where padded slots carry the
// zero-point value 10. Returns the output element count; *correct receives a
// malloc'd copy of the expected data.
int PadInt8TestInit1(std::vector<Tensor *> *inputs_, std::vector<Tensor *> *outputs_, PadParameter *pad_param,
                     int8_t **correct) {
  auto *input_tensor = new Tensor(kNumberTypeInt8, {3}, schema::Format_NHWC, schema::NodeType_Parameter);
  input_tensor->MallocData();
  int8_t in[] = {1, 1, 1};
  memcpy(input_tensor->Data(), in, sizeof(int8_t) * input_tensor->ElementsNum());
  auto *in_quant_arg = new QuantArg();
  in_quant_arg->zeroPoint = 10;
  in_quant_arg->scale = 0.31228156;
  input_tensor->AddQuantParam(*in_quant_arg);
  inputs_->push_back(input_tensor);

  auto *output_tensor = new Tensor(kNumberTypeInt8, {7}, schema::Format_NHWC, schema::NodeType_Parameter);
  output_tensor->MallocData();
  auto *out_quant_arg = new QuantArg();
  out_quant_arg->zeroPoint = 10;
  out_quant_arg->scale = 0.31228156;
  output_tensor->AddQuantParam(*out_quant_arg);
  outputs_->push_back(output_tensor);

  *correct = reinterpret_cast<int8_t *>(malloc(output_tensor->ElementsNum() * sizeof(int8_t)));
  int8_t co[] = {10, 10, 1, 1, 1, 10, 10};
  memcpy(*correct, co, output_tensor->ElementsNum() * sizeof(int8_t));

  // pad 2 before and 2 after the last dimension only
  int padding[] = {0, 0, 0, 0, 0, 0, 2, 2};
  memcpy(pad_param->paddings_, padding, MAX_PAD_SIZE * sizeof(int));
  pad_param->constant_value_ = 0;
  return output_tensor->ElementsNum();
}
// Runs the int8 pad kernel on the 1-D fixture and expects a bit-exact match.
TEST_F(TestPadInt8, PadInt8Test1) {
  std::vector<lite::tensor::Tensor *> in_tensors;
  std::vector<lite::tensor::Tensor *> out_tensors;
  auto *param = new PadParameter();
  auto *context = new lite::Context;
  int8_t *expect;
  int element_num = PadInt8TestInit1(&in_tensors, &out_tensors, param, &expect);
  auto *op =
    new kernel::PadInt8CPUKernel(reinterpret_cast<OpParameter *>(param), in_tensors, out_tensors, context);
  op->Init();
  op->Run();
  // tolerance 0: padding must reproduce values exactly
  CompareOutputData(reinterpret_cast<int8_t *>(out_tensors[0]->Data()), expect, element_num, 0);
  delete param;
  delete op;
  for (auto t : in_tensors) delete t;
  for (auto t : out_tensors) delete t;
  free(expect);
}
// Builds a 2-D pad fixture: input [6,2] padded with 3 rows before / 1 after
// and 1 column before / 2 after, producing a [10,5] output whose padded slots
// carry the zero-point value 10. Returns the output element count; *correct
// receives a malloc'd copy of the expected data.
int PadInt8TestInit2(std::vector<Tensor *> *inputs_, std::vector<Tensor *> *outputs_, PadParameter *pad_param,
                     int8_t **correct) {
  Tensor *in_t = new Tensor(kNumberTypeInt8, {6, 2}, schema::Format_NHWC, schema::NodeType_Parameter);
  in_t->MallocData();
  int8_t in[] = {18, 71, 99, -6, 5, -119, 86, 13, 15, -85, -41, -77};
  memcpy(in_t->Data(), in, sizeof(int8_t) * in_t->ElementsNum());
  QuantArg *in_quant_arg = new QuantArg();
  in_quant_arg->zeroPoint = 10, in_quant_arg->scale = 0.31228156;
  in_t->AddQuantParam(*in_quant_arg);
  inputs_->push_back(in_t);

  Tensor *out_t = new Tensor(kNumberTypeInt8, {10, 5}, schema::Format_NHWC, schema::NodeType_Parameter);
  out_t->MallocData();
  QuantArg *out_quant_arg = new QuantArg();
  out_quant_arg->zeroPoint = 10, out_quant_arg->scale = 0.31228156;
  out_t->AddQuantParam(*out_quant_arg);
  outputs_->push_back(out_t);

  *correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t)));
  // expected 10x5 output: 3 leading pad rows, one pad column left / two right,
  // and a trailing pad row; pads are the zero point (10)
  int8_t co[] = {10, 10,  10,   10, 10,
                 10, 10,  10,   10, 10,
                 10, 10,  10,   10, 10,
                 10, 18,  71,   10, 10,
                 10, 99,  -6,   10, 10,
                 10, 5,   -119, 10, 10,
                 10, 86,  13,   10, 10,
                 10, 15,  -85,  10, 10,
                 10, -41, -77,  10, 10,
                 10, 10,  10,   10, 10};
  memcpy(*correct, co, out_t->ElementsNum() * sizeof(int8_t));

  // paddings (before/after pairs, last two dims): rows 3/1, columns 1/2
  int padding[] = {0, 0, 0, 0, 3, 1, 1, 2};
  memcpy(pad_param->paddings_, padding, MAX_PAD_SIZE * sizeof(int));
  pad_param->constant_value_ = 0;
  return out_t->ElementsNum();
}
// Runs the int8 pad kernel on the 2-D fixture and expects a bit-exact match.
TEST_F(TestPadInt8, PadInt8Test2) {
  std::vector<lite::tensor::Tensor *> in_tensors;
  std::vector<lite::tensor::Tensor *> out_tensors;
  auto *param = new PadParameter();
  auto *context = new lite::Context;
  int8_t *expect;
  int element_num = PadInt8TestInit2(&in_tensors, &out_tensors, param, &expect);
  auto *op =
    new kernel::PadInt8CPUKernel(reinterpret_cast<OpParameter *>(param), in_tensors, out_tensors, context);
  op->Init();
  op->Run();
  // tolerance 0: padding must reproduce values exactly
  CompareOutputData(reinterpret_cast<int8_t *>(out_tensors[0]->Data()), expect, element_num, 0);
  delete param;
  delete op;
  for (auto t : in_tensors) delete t;
  for (auto t : out_tensors) delete t;
  free(expect);
}
// Builds a 4-D pad fixture: input [2,3,2,1] with per-dimension (before/after)
// paddings N:3/1, H:1/2, W:2/0, C:1/1, giving a [6,6,4,3] output (432
// values). Padded slots carry the zero-point value 10; the twelve input
// values land at out[n+3][h+1][w+2][1]. Returns the output element count;
// *correct receives a malloc'd copy of the expected data.
int PadInt8TestInit4(std::vector<Tensor *> *inputs_, std::vector<Tensor *> *outputs_, PadParameter *pad_param,
                     int8_t **correct) {
  Tensor *in_t = new Tensor(kNumberTypeInt8, {2, 3, 2, 1}, schema::Format_NHWC, schema::NodeType_Parameter);
  in_t->MallocData();
  int8_t in[] = {73, 24, 7, -31, -109, -2, 69, -64, 51, -45, 38, 53};
  memcpy(in_t->Data(), in, sizeof(int8_t) * in_t->ElementsNum());
  QuantArg *in_quant_arg = new QuantArg();
  in_quant_arg->zeroPoint = 10, in_quant_arg->scale = 0.31228156;
  in_t->AddQuantParam(*in_quant_arg);
  inputs_->push_back(in_t);

  Tensor *out_t = new Tensor(kNumberTypeInt8, {6, 6, 4, 3}, schema::Format_NHWC, schema::NodeType_Parameter);
  out_t->MallocData();
  QuantArg *out_quant_arg = new QuantArg();
  out_quant_arg->zeroPoint = 10, out_quant_arg->scale = 0.31228156;
  out_t->AddQuantParam(*out_quant_arg);
  outputs_->push_back(out_t);

  *correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t)));
  // expected flattened [6,6,4,3] output; everything is the zero point (10)
  // except the twelve source values at their padded NHWC offsets
  int8_t co[] = {10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 73, 10, 10, 24, 10,
                 10, 10, 10, 10, 10, 10, 10, 7, 10, 10, -31, 10,
                 10, 10, 10, 10, 10, 10, 10, -109, 10, 10, -2, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 69, 10, 10, -64, 10,
                 10, 10, 10, 10, 10, 10, 10, 51, 10, 10, -45, 10,
                 10, 10, 10, 10, 10, 10, 10, 38, 10, 10, 53, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
  memcpy(*correct, co, out_t->ElementsNum() * sizeof(int8_t));

  // paddings as (before, after) pairs per dimension: N 3/1, H 1/2, W 2/0, C 1/1
  int padding[] = {3, 1, 1, 2, 2, 0, 1, 1};
  memcpy(pad_param->paddings_, padding, MAX_PAD_SIZE * sizeof(int));
  pad_param->constant_value_ = 0;
  return out_t->ElementsNum();
}
// Exercises the 4-D padding case built by PadInt8TestInit4 and expects a
// bit-exact match.
TEST_F(TestPadInt8, PadInt8TestInit4) {
  std::vector<lite::tensor::Tensor *> inputs_;
  std::vector<lite::tensor::Tensor *> outputs_;
  auto pad_param = new PadParameter();
  lite::Context *ctx = new lite::Context;
  int8_t *correct;
  // Fix: the original called PadInt8TestInit2 here (copy/paste slip), which
  // re-ran the 2-D case and left PadInt8TestInit4 dead; this test is meant to
  // cover the 4-D fixture its name refers to.
  int total_size = PadInt8TestInit4(&inputs_, &outputs_, pad_param, &correct);
  kernel::PadInt8CPUKernel *pad =
    new kernel::PadInt8CPUKernel(reinterpret_cast<OpParameter *>(pad_param), inputs_, outputs_, ctx);
  pad->Init();
  pad->Run();
  CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 0);
  delete pad_param;
  delete pad;
  for (auto t : inputs_) delete t;
  for (auto t : outputs_) delete t;
  free(correct);
}
}
// namespace mindspore
mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/conv1x1fp32_output1_nhwc.bin
浏览文件 @
8beb1b0f
无法预览此类型文件
mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/deconv/deconv_fp32_nchw_output1.bin
浏览文件 @
8beb1b0f
无法预览此类型文件
mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/deconv/deconv_fp32_nhwc_input1.bin
浏览文件 @
8beb1b0f
无法预览此类型文件
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录