Commit 8d06c2b8 authored by yangruoqi713

[MS][LITE] optimize arm cpu int8 conv depthwise op: add common and sliding-window implementations to select between

Parent e6112ed1
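In short, the commit keeps the existing sliding-window depthwise int8 kernel for per-channel filter quantization and adds a direct row-accumulation ("common") path for the per-tensor case; the kernel creator chooses between them. A minimal sketch of that selection, mirroring the CpuConvDwInt8KernelCreator change later in this diff (the framework classes and kWeightIndex are assumed to come from the existing kernel code):

// Hedged sketch of the dispatch this commit introduces; names follow the diff below.
kernel::LiteKernel *kernel = nullptr;
auto filter_quant_size = inputs[kWeightIndex]->GetQuantParams().size();
if (filter_quant_size == 1) {
  // per-tensor filter quantization -> new common path (ConvDwInt8Row + ConvDwInt8PostAlign4)
  kernel = new (std::nothrow) kernel::ConvolutionDepthwiseInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else {
  // per-channel filter quantization -> keep the sliding-window path (ConvDwSWInt8)
  kernel = new (std::nothrow) kernel::ConvolutionDepthwiseSWInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
}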
......@@ -29,7 +29,7 @@ mov x6, x1
mov x7, x2
mov x8, x4
LoopInputDepth16In:
LoopDepth16In:
cmp x8, #16
blt L4
sub x8, x8, #16
......@@ -39,8 +39,8 @@ mov x8, x4
ld1 {v16.4s, v17.4s}, [x0], #32
cmp x8, #16
blt LoopInputDepth16Out
LoopInputDepth16:
blt LoopDepth16Out
LoopDepth16:
fmla v16.4s, v0.4s, v2.4s
fmla v17.4s, v1.4s, v3.4s
......@@ -61,9 +61,9 @@ mov x8, x4
sub x8, x8, #16
cmp x8, #16
bge LoopInputDepth16
bge LoopDepth16
LoopInputDepth16Out:
LoopDepth16Out:
fmla v16.4s, v0.4s, v2.4s
fmla v17.4s, v1.4s, v3.4s
st1 {v16.4s, v17.4s}, [x9], #32
......@@ -81,7 +81,7 @@ mov x8, x4
cmp x8, #4
blt L0
LoopInputDepth4:
LoopDepth4:
ld1 {v0.4s}, [x6], #16
ld1 {v2.4s}, [x7], #16
ld1 {v16.4s}, [x0], #16
......@@ -89,13 +89,13 @@ mov x8, x4
st1 {v16.4s}, [x9], #16
sub x8, x8, #4
cmp x8, #4
bge LoopInputDepth4
bge LoopDepth4
L0:
cmp x8, #0
beq Loop16LineEnd
LoopInputDepth0:
LoopDepth0:
ldr s0, [x6], #4
ldr s1, [x7], #4
ldr s2, [x0], #4
......@@ -103,7 +103,7 @@ mov x8, x4
fadd s2, s2, s0
str s2, [x9], #4
subs x8, x8, #1
bne LoopInputDepth0
bne LoopDepth0
Loop16LineEnd:
......
#ifdef __aarch64__
.text
.align 5
.global ConvDwInt8PostAlign4
#ifndef __APPLE__
.type ConvDwInt8PostAlign4, %function
#endif
// void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
// int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
// x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier,
// x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max
ConvDwInt8PostAlign4:
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should also be preserved
// whereas our coding style does not permit that many parameters
ldr x8, [sp]
dup v26.4s, w5
dup v27.4s, w4
dup v28.4s, w6
dup v29.4s, w3
dup v30.4s, w7
dup v31.4s, w8
cmp x2, 16
blt LoopDepth8
LoopDepth16:
ld1 {v0.4s}, [x1], #16
ld1 {v1.4s}, [x1], #16
ld1 {v2.4s}, [x1], #16
ld1 {v3.4s}, [x1], #16
sqshl v0.4s, v0.4s, v26.4s
sqshl v1.4s, v1.4s, v26.4s
sqshl v2.4s, v2.4s, v26.4s
sqshl v3.4s, v3.4s, v26.4s
sqrdmulh v0.4s, v0.4s, v27.4s
sqrdmulh v1.4s, v1.4s, v27.4s
sqrdmulh v2.4s, v2.4s, v27.4s
sqrdmulh v3.4s, v3.4s, v27.4s
and v16.16b, v28.16b, v0.16b
sshr v16.4s, v16.4s, #31
sqadd v0.4s, v0.4s, v16.4s
srshl v0.4s, v0.4s, v28.4s
and v17.16b, v28.16b, v1.16b
sshr v17.4s, v17.4s, #31
sqadd v1.4s, v1.4s, v17.4s
srshl v1.4s, v1.4s, v28.4s
and v18.16b, v28.16b, v2.16b
sshr v18.4s, v18.4s, #31
sqadd v2.4s, v2.4s, v18.4s
srshl v2.4s, v2.4s, v28.4s
and v19.16b, v28.16b, v3.16b
sshr v19.4s, v19.4s, #31
sqadd v3.4s, v3.4s, v19.4s
srshl v3.4s, v3.4s, v28.4s
add v0.4s, v0.4s, v29.4s
add v1.4s, v1.4s, v29.4s
add v2.4s, v2.4s, v29.4s
add v3.4s, v3.4s, v29.4s
smax v0.4s, v0.4s, v30.4s
smax v1.4s, v1.4s, v30.4s
smax v2.4s, v2.4s, v30.4s
smax v3.4s, v3.4s, v30.4s
smin v0.4s, v0.4s, v31.4s
smin v1.4s, v1.4s, v31.4s
smin v2.4s, v2.4s, v31.4s
smin v3.4s, v3.4s, v31.4s
sqxtn v0.4h, v0.4s
sqxtn v1.4h, v1.4s
sqxtn v2.4h, v2.4s
sqxtn v3.4h, v3.4s
sqxtn v0.8b, v0.8h
sqxtn v1.8b, v1.8h
sqxtn v2.8b, v2.8h
sqxtn v3.8b, v3.8h
st1 {v0.s}[0], [x0], #4
st1 {v1.s}[0], [x0], #4
st1 {v2.s}[0], [x0], #4
st1 {v3.s}[0], [x0], #4
sub x2, x2, #16
cmp x2, #16
bge LoopDepth16
LoopDepth8:
cmp x2, #8
blt LoopDepth4
ld1 {v0.4s}, [x1], #16
ld1 {v1.4s}, [x1], #16
sqshl v0.4s, v0.4s, v26.4s
sqshl v1.4s, v1.4s, v26.4s
sqrdmulh v0.4s, v0.4s, v27.4s
sqrdmulh v1.4s, v1.4s, v27.4s
and v16.16b, v28.16b, v0.16b
sshr v16.4s, v16.4s, #31
sqadd v0.4s, v0.4s, v16.4s
srshl v0.4s, v0.4s, v28.4s
and v17.16b, v28.16b, v1.16b
sshr v17.4s, v17.4s, #31
sqadd v1.4s, v1.4s, v17.4s
srshl v1.4s, v1.4s, v28.4s
add v0.4s, v0.4s, v29.4s
add v1.4s, v1.4s, v29.4s
smax v0.4s, v0.4s, v30.4s
smax v1.4s, v1.4s, v30.4s
smin v0.4s, v0.4s, v31.4s
smin v1.4s, v1.4s, v31.4s
sqxtn v0.4h, v0.4s
sqxtn v1.4h, v1.4s
sqxtn v0.8b, v0.8h
sqxtn v1.8b, v1.8h
st1 {v0.s}[0], [x0], #4
st1 {v1.s}[0], [x0], #4
sub x2, x2, #8
cmp x2, #8
bge LoopDepth8
LoopDepth4:
cmp x2, #4
blt End
ld1 {v0.4s}, [x1], #16
sqshl v0.4s, v0.4s, v26.4s
sqrdmulh v0.4s, v0.4s, v27.4s
and v16.16b, v28.16b, v0.16b
sshr v16.4s, v16.4s, #31
sqadd v0.4s, v0.4s, v16.4s
srshl v0.4s, v0.4s, v28.4s
add v0.4s, v0.4s, v29.4s
smax v0.4s, v0.4s, v30.4s
smin v0.4s, v0.4s, v31.4s
sqxtn v0.4h, v0.4s
sqxtn v0.8b, v0.8h
st1 {v0.s}[0], [x0], #4
sub x2, x2, #4
bge LoopDepth4
End:
ret
#endif
#ifdef __aarch64__
.text
.align 5
.global ConvDwInt8Row
#ifndef __APPLE__
.type ConvDwInt8Row, %function
#endif
// void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
// int output_channel, int input_step, int8_t input_zp)
// x0: output_ptr, x1: input_ptr, x2: weight_ptr, x3: num_pixels,
// x4: output_channel, x5: input_step, x6: input_zp
//
ConvDwInt8Row:
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should also be preserved
// whereas our coding style does not permit that many parameters
cmp x3, #0
beq End
mov x10, x0
dup v31.8b, w6
LoopOutPixel:
mov x7, x1
mov x8, x2
mov x9, x4
LoopDepth16In:
cmp x9, #16
blt L8
sub x9, x9, #16
ld1 {v0.8b, v1.8b}, [x7], #16
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s}, [x0], #32
ssubl v20.8h, v0.8b, v31.8b
smlal v16.4s, v20.4h, v2.4h
smlal2 v17.4s, v20.8h, v2.8h
cmp x9, #16
blt LoopDepth16Out
LoopDepth16:
st1 {v16.4s, v17.4s}, [x10], #32
ld1 {v18.4s, v19.4s}, [x0], #32
ssubl v21.8h, v1.8b, v31.8b
smlal v18.4s, v21.4h, v3.4h
smlal2 v19.4s, v21.8h, v3.8h
st1 {v18.4s, v19.4s}, [x10], #32
ld1 {v0.8b, v1.8b}, [x7], #16
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s}, [x0], #32
ssubl v20.8h, v0.8b, v31.8b
smlal v16.4s, v20.4h, v2.4h
smlal2 v17.4s, v20.8h, v2.8h
sub x9, x9, #16
cmp x9, #16
bge LoopDepth16
LoopDepth16Out:
st1 {v16.4s, v17.4s}, [x10], #32
ld1 {v18.4s, v19.4s}, [x0], #32
ssubl v21.8h, v1.8b, v31.8b
smlal v18.4s, v21.4h, v3.4h
smlal2 v19.4s, v21.8h, v3.8h
st1 {v18.4s, v19.4s}, [x10], #32
L8:
cmp x9, #8
blt L0
LoopDepth8:
ld1 {v0.8b}, [x7], #8
ld1 {v2.8h}, [x8], #16
ld1 {v16.4s, v17.4s}, [x0], #32
ssubl v20.8h, v0.8b, v31.8b
smlal v16.4s, v20.4h, v2.4h
smlal2 v17.4s, v20.8h, v2.8h
st1 {v16.4s, v17.4s}, [x10], #32
sub x9, x9, #8
cmp x9, #8
bge LoopDepth8
L0:
cmp x9, #0
beq Loop16LineEnd
LoopDepth0:
ldrsb w14, [x7], #1
ldrsh w15, [x8], #2
ldr w16, [x0], #4
sub w14, w14, w6
sxth w14, w14
madd w14, w14, w15, w16
str w14, [x10], #4
subs x9, x9, #1
bne LoopDepth0
Loop16LineEnd:
subs x3, x3, #1
add x1, x1, x5
bne LoopOutPixel
End:
ret
#endif
......@@ -49,6 +49,10 @@ void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, co
size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int out_multiplier,
int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max);
void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
int output_channel, int input_step, int8_t input_zp);
void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
#endif
#ifdef __cplusplus
......
......@@ -20,6 +20,99 @@
#include "nnacl/int8/common_func.h"
/*conv depthwise int8 begin*/
// only supports per-layer quantization
#ifndef ENABLE_ARM64
void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
int output_channel, int input_step, int8_t input_zp) {
for (int i = 0; i < num_pixels; i++) {
for (int c = 0; c < output_channel; c++) {
const int16_t input = input_ptr[c] - input_zp;
*output_ptr++ += input * weight_ptr[c];
}
input_ptr += input_step;
}
}
#endif
void ConvDwInt8Post(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max) {
int align_num = 0;
#ifdef ENABLE_ARM64
align_num = num_pixels / 4 * 4;
ConvDwInt8PostAlign4(dst, buffer, align_num, output_zp, out_multiplier, left_shift, right_shift, acc_min, acc_max);
#endif
for (int i = align_num; i < num_pixels; i++) {
buffer[i] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(buffer[i] * (1 << (unsigned int)left_shift), out_multiplier), -right_shift);
buffer[i] += output_zp;
buffer[i] = MSMAX(buffer[i], acc_min);
buffer[i] = MSMIN(buffer[i], acc_max);
dst[i] = (int8_t)(buffer[i]);
}
}
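For reference, the two fixed-point helpers used above are the standard gemmlowp-style routines; the sketch below shows what they are assumed to compute (it is not copied from this commit, and nnacl's own definitions should be checked). They correspond to the NEON sequence in ConvDwInt8PostAlign4: sqshl applies the left shift, sqrdmulh is the saturating rounding doubling high multiply, and the and/sshr/sqadd/srshl group performs the rounding divide by a power of two.

#include <cstdint>
#include <climits>

// Assumed behavior of SaturatingRoundingDoublingHighMul: (a * b * 2) rounded to the high 32 bits,
// saturating the single overflow case INT32_MIN * INT32_MIN.
static inline int32_t SatRoundingDoublingHighMul(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN) {
    return INT32_MAX;
  }
  int64_t ab = (int64_t)a * (int64_t)b;
  int64_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  return (int32_t)((ab + nudge) / (1ll << 31));
}

// Assumed behavior of RoundingDivideByPOT: arithmetic right shift by 'exponent' with round-to-nearest.
static inline int32_t RoundingDivByPOT(int32_t x, int exponent) {
  int32_t mask = (int32_t)((1ll << exponent) - 1);
  int32_t remainder = x & mask;
  int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}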
void ConvDwInt8(int8_t *output_data, int32_t *row_buffer, const int8_t *input_data, const int16_t *weight_data,
const int32_t *bias_data, const ConvParameter *conv_param, int task_id) {
int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
int h_start = h_step * task_id;
int h_end = MSMIN(h_start + h_step, conv_param->output_h_);
int out_multiplier = conv_param->conv_quant_arg_.quant_multiplier_[0];
int left_shift = conv_param->conv_quant_arg_.left_shift_[0];
int right_shift = conv_param->conv_quant_arg_.right_shift_[0];
int input_zp = conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
int output_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_;
int acc_min = conv_param->conv_quant_arg_.out_act_min_[0];
int acc_max = conv_param->conv_quant_arg_.out_act_max_[0];
for (int b = 0; b < conv_param->output_batch_; b++) {
const int8_t *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
int8_t *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_;
for (int oh = h_start; oh < h_end; oh++) {
int8_t *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_;
int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_;
int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_));
int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));
// init acc
for (int ow = 0; ow < conv_param->output_w_; ow++) {
memcpy(row_buffer + ow * conv_param->output_channel_, bias_data, conv_param->output_channel_ * sizeof(int32_t));
}
for (int kh = start_kh; kh < end_kh; kh++) {
int ih = ih_origin + conv_param->dilation_h_ * kh;
const int8_t *src_kh = src + ih * conv_param->input_w_ * conv_param->input_channel_;
const int16_t *weight_kh = weight_data + kh * conv_param->kernel_w_ * conv_param->output_channel_;
int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_;
for (int kw = 0; kw < conv_param->kernel_w_; kw++) {
int out_w_start = MSMAX(
0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ -
conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) /
conv_param->stride_w_);
int32_t *acc_w = row_buffer + out_w_start * conv_param->output_channel_;
int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw;
const int8_t *src_kw = src_kh + iw_origin * conv_param->input_channel_;
int num_pixels = out_w_end - out_w_start;
ConvDwInt8Row(acc_w, src_kw, weight_kh, num_pixels, conv_param->output_channel_, in_sw_step, input_zp);
weight_kh += conv_param->output_channel_;
}
}
// post func, acc int32 -> dst int8
ConvDwInt8Post(dst_data, row_buffer, conv_param->output_w_ * conv_param->output_channel_, output_zp,
out_multiplier, left_shift, right_shift, acc_min, acc_max);
}
}
}
/*conv depthwise int8 end*/
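The out_w_start / out_w_end expressions in the kw loop above clip the output columns so that the corresponding input column stays inside the un-padded image. A minimal sketch of the derivation follows, using the same parameter names; the helper itself is illustrative and not part of the commit.

#include <algorithm>

// For kernel column kw, output column ow reads input column
//   iw = ow * stride_w - pad_l + dilation_w * kw.
// Requiring 0 <= iw < input_w and solving for ow gives the half-open range below,
// which is what the MSMAX/MSMIN expressions in ConvDwInt8 compute.
static void DwValidOutWRange(int output_w, int input_w, int pad_l, int stride_w, int dilation_w, int kw,
                             int *out_w_start, int *out_w_end) {
  int lo = (pad_l - dilation_w * kw + stride_w - 1) / stride_w;            // ceil((pad_l - d*kw) / stride_w)
  int hi = (input_w + pad_l - dilation_w * kw + stride_w - 1) / stride_w;  // ceil((input_w + pad_l - d*kw) / stride_w)
  *out_w_start = std::max(0, lo);
  *out_w_end = std::min(output_w, hi);
}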
/*conv depthwise sliding window int8 begin*/
void DepthwiseBorderPixelInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height,
int width, int in_kh_step, int in_kw_step, int kernel_w, int *out_multiplier,
int *left_shift, int *right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max,
......@@ -153,8 +246,8 @@ void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight,
}
#endif
void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) {
void ConvDwSWInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) {
const int16_t *src = input_data;
int8_t *dst = output_data;
bool per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
......@@ -215,7 +308,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
} // batch loop
// output nhwc4
}
/*conv depthwise int8 end*/
/*conv depthwise sliding window int8 end*/
/*deconv depthwise int8 begin*/
void DeconvDepthwiseBorderPixelInt8(int32_t *dst, const int16_t *src, const int16_t *weight, int height, int width,
......
......@@ -23,8 +23,12 @@
#ifdef __cplusplus
extern "C" {
#endif
void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id);
void ConvDwInt8(int8_t *output_data, int32_t *output_row, const int8_t *input_data, const int16_t *weight_data,
const int32_t *bias_data, const ConvParameter *conv_param, int task_id);
void ConvDwSWInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data,
const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id);
void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *input_data, const int16_t *weight_data,
const int32_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
......
......@@ -15,6 +15,7 @@
*/
#include "src/runtime/kernel/arm/int8/convolution_depthwise_int8.h"
#include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
......@@ -29,10 +30,6 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace mindspore::kernel {
ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
if (sliding != nullptr) {
delete sliding;
sliding = nullptr;
}
if (packed_weight_ != nullptr) {
free(packed_weight_);
packed_weight_ = nullptr;
......@@ -42,63 +39,44 @@ ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
// init weight, int8 -> int16
// o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
auto weight_tensor = in_tensors_[kWeightIndex];
auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->Data());
int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
int channel = weight_tensor->Batch();
int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width();
auto tmp_weight = reinterpret_cast<int8_t *>(malloc(pack_weight_size * sizeof(int8_t)));
if (tmp_weight == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
PackNCHWToNHWCInt8(origin_weight, tmp_weight, 1, weight_tensor->Height() * weight_tensor->Width(),
weight_tensor->Batch());
int weight_zp = conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_;
packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(),
weight_tensor->Batch(), &(conv_param_->conv_quant_arg_));
for (int i = 0; i < weight_tensor->ElementsNum(); i++) {
packed_weight_[i] = (int16_t)(tmp_weight[i] - weight_zp);
}
bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
bias_data_ = reinterpret_cast<int32_t *>(malloc(channel * sizeof(int32_t)));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
memset(bias_data_, 0, channel * sizeof(int32_t));
if (in_tensors_.size() == kInputSize2) {
auto bias_tensor = in_tensors_.at(kBiasIndex);
auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->Data());
memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
}
conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
return RET_OK;
}
int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() {
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
UP_DIV(conv_param_->input_channel_, 4);
packed_input_ = reinterpret_cast<int16_t *>(context_->allocator->Malloc(pack_input_size * sizeof(int16_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
if (conv_param_->input_channel_ % C4NUM != 0) {
need_align_ = true;
int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
UP_DIV(conv_param_->output_channel_, C4NUM);
packed_output_ = reinterpret_cast<int8_t *>(context_->allocator->Malloc(pack_output_size * sizeof(int8_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
}
return RET_OK;
}
int ConvolutionDepthwiseInt8CPUKernel::Init() {
sliding = new (std::nothrow) SlidingWindowParam;
if (sliding == nullptr) {
MS_LOG(ERROR) << "new sliding window param.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
......@@ -107,13 +85,12 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() {
int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
ConvolutionBaseCPUKernel::Init();
InitSlidingParamConvDw(sliding, conv_param_, C4NUM);
auto ret = ConvolutionBaseCPUKernel::SetQuantParam();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Set quant param failed.";
return ret;
}
conv_param_->thread_num_ = MSMIN(thread_count_, conv_param_->output_h_);
ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!";
......@@ -123,8 +100,9 @@ int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
}
int ConvolutionDepthwiseInt8CPUKernel::Execute(int task_id) {
ConvDwInt8(packed_output_, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), conv_param_,
sliding, task_id);
auto buffer = row_buffer_ + conv_param_->output_w_ * conv_param_->output_channel_ * task_id;
ConvDwInt8(output_ptr_, buffer, input_ptr_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), conv_param_,
task_id);
return RET_OK;
}
......@@ -138,6 +116,16 @@ int ConvDwInt8Run(void *cdata, int task_id) {
return RET_OK;
}
int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() {
int output_row_size = conv_param_->thread_num_ * conv_param_->output_w_ * conv_param_->output_channel_;
row_buffer_ = reinterpret_cast<int32_t *>(context_->allocator->Malloc(output_row_size * sizeof(int32_t)));
if (row_buffer_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
return RET_OK;
}
int ConvolutionDepthwiseInt8CPUKernel::Run() {
if (conv_param_->input_channel_ != conv_param_->output_channel_) {
MS_LOG(ERROR) << "Only support input channel equals output channel.";
......@@ -156,13 +144,10 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() {
}
auto input_tensor = in_tensors_.at(kInputIndex);
auto input_addr = reinterpret_cast<int8_t *>(input_tensor->Data());
PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
input_ptr_ = reinterpret_cast<int8_t *>(input_tensor->Data());
auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->Data());
if (!need_align_) {
packed_output_ = output_addr;
}
auto output_tensor = out_tensors_.at(kOutputIndex);
output_ptr_ = reinterpret_cast<int8_t *>(output_tensor->Data());
ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwInt8Run, this, conv_param_->thread_num_);
if (ret != RET_OK) {
......@@ -170,12 +155,7 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() {
return RET_ERROR;
}
if (need_align_) {
PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
context_->allocator->Free(packed_output_);
}
context_->allocator->Free(packed_input_);
context_->allocator->Free(row_buffer_);
return RET_OK;
}
......@@ -186,8 +166,14 @@ kernel::LiteKernel *CpuConvDwInt8KernelCreator(const std::vector<lite::tensor::T
const mindspore::lite::PrimitiveC *primitive) {
MS_ASSERT(opParameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_DepthwiseConv2D);
auto kernel =
new (std::nothrow) kernel::ConvolutionDepthwiseInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
kernel::LiteKernel *kernel;
auto filter_quant_size = inputs[kWeightIndex]->GetQuantParams().size();
if (filter_quant_size == 1) { // per tensor
kernel = new (std::nothrow) kernel::ConvolutionDepthwiseInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else { // per channel
kernel =
new (std::nothrow) kernel::ConvolutionDepthwiseSWInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
}
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel is nullptr.";
return nullptr;
......
......@@ -36,15 +36,14 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
int Run() override;
int InitWeightBias();
int InitBuffer();
int Execute(int task_id);
private:
SlidingWindowParam *sliding = nullptr;
int InitBuffer();
int16_t *packed_weight_ = nullptr;
int16_t *packed_input_ = nullptr;
int8_t *packed_output_ = nullptr;
bool need_align_ = false;
int8_t *input_ptr_ = nullptr;
int8_t *output_ptr_ = nullptr;
int32_t *row_buffer_ = nullptr;
};
} // namespace mindspore::kernel
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "nnacl/int8/conv_depthwise_int8.h"
#include "src/runtime/runtime_api.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace mindspore::kernel {
ConvolutionDepthwiseSWInt8CPUKernel::~ConvolutionDepthwiseSWInt8CPUKernel() {
if (sliding != nullptr) {
delete sliding;
sliding = nullptr;
}
if (packed_weight_ != nullptr) {
free(packed_weight_);
packed_weight_ = nullptr;
}
FreeQuantParam();
}
int ConvolutionDepthwiseSWInt8CPUKernel::InitWeightBias() {
// init weight, int8 -> int16
// o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
auto weight_tensor = in_tensors_[kWeightIndex];
auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->Data());
int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(),
weight_tensor->Batch(), &(conv_param_->conv_quant_arg_));
bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
if (in_tensors_.size() == kInputSize2) {
auto bias_tensor = in_tensors_.at(kBiasIndex);
auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->Data());
memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
}
conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
return RET_OK;
}
int ConvolutionDepthwiseSWInt8CPUKernel::InitBuffer() {
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
UP_DIV(conv_param_->input_channel_, 4);
packed_input_ = reinterpret_cast<int16_t *>(context_->allocator->Malloc(pack_input_size * sizeof(int16_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
if (conv_param_->input_channel_ % C4NUM != 0) {
need_align_ = true;
int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
UP_DIV(conv_param_->output_channel_, C4NUM);
packed_output_ = reinterpret_cast<int8_t *>(context_->allocator->Malloc(pack_output_size * sizeof(int8_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
}
return RET_OK;
}
int ConvolutionDepthwiseSWInt8CPUKernel::Init() {
sliding = new (std::nothrow) SlidingWindowParam;
if (sliding == nullptr) {
MS_LOG(ERROR) << "new sliding window param.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int ConvolutionDepthwiseSWInt8CPUKernel::ReSize() {
ConvolutionBaseCPUKernel::Init();
InitSlidingParamConvDw(sliding, conv_param_, C4NUM);
auto ret = ConvolutionBaseCPUKernel::SetQuantParam();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Set quant param failed.";
return ret;
}
ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!";
return ret;
}
return RET_OK;
}
int ConvolutionDepthwiseSWInt8CPUKernel::Execute(int task_id) {
ConvDwSWInt8(packed_output_, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), conv_param_,
sliding, task_id);
return RET_OK;
}
int ConvDwSWInt8Run(void *cdata, int task_id) {
auto conv_dw_int8 = reinterpret_cast<ConvolutionDepthwiseSWInt8CPUKernel *>(cdata);
auto ret = conv_dw_int8->Execute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionDepthwiseSWInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;
}
return RET_OK;
}
int ConvolutionDepthwiseSWInt8CPUKernel::Run() {
if (conv_param_->input_channel_ != conv_param_->output_channel_) {
MS_LOG(ERROR) << "Only support input channel equals output channel.";
return RET_ERROR;
}
auto ret = Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare failed.";
return RET_ERROR;
}
ret = InitBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Depthwise int8 ReSize error!";
return ret;
}
auto input_tensor = in_tensors_.at(kInputIndex);
auto input_addr = reinterpret_cast<int8_t *>(input_tensor->Data());
PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->Data());
if (!need_align_) {
packed_output_ = output_addr;
}
ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwSWInt8Run, this, conv_param_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvDwSWInt8Run error: error_code[" << ret << "]";
return RET_ERROR;
}
if (need_align_) {
PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
context_->allocator->Free(packed_output_);
}
context_->allocator->Free(packed_input_);
return RET_OK;
}
} // namespace mindspore::kernel
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "nnacl/fp32/conv_depthwise.h"
namespace mindspore::kernel {
class ConvolutionDepthwiseSWInt8CPUKernel : public ConvolutionBaseCPUKernel {
public:
ConvolutionDepthwiseSWInt8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDepthwiseSWInt8CPUKernel() override;
int Init() override;
int ReSize() override;
int Run() override;
int InitWeightBias();
int InitBuffer();
int Execute(int task_id);
private:
SlidingWindowParam *sliding = nullptr;
int16_t *packed_weight_ = nullptr;
int16_t *packed_input_ = nullptr;
int8_t *packed_output_ = nullptr;
bool need_align_ = false;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_