!4739 [MS][LITE][Develop]add fp32 sliding window kernel

Merge pull request !4739 from lixian/master

!4739 [MS][LITE][Develop]add fp32 sliding window kernel
Merge pull request !4739 from lixian/master
0ec5a570 · mindspore-ci-bot · Gitee · 9ce6b36e · a5bd2548 · 0ec5a570
6 changed files
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
@@ -258,8 +258,7 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten
    kernel =
      new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
  } else if (use_sw) {
-    // kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
-    kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
+    kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
  } else {
    kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
  }

--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Center.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Center.S
@@ -18,7 +18,9 @@ ConvDwFp32Center:
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
    // whereas our coding style do not permit such amount of parameters
-    sub sp, sp, #48
+    sub sp, sp, #176
+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
    stp x19, x20, [sp], #16
    stp x21, x22, [sp], #16
    stp x23, x24, [sp], #16
@@ -287,7 +289,9 @@ ConvDwFp32Center:
        subs x4, x4, #1
        bne LoopH

-    sub sp, sp, #48
+    sub sp, sp, #176
+    ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
+    ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
    ldp x19, x20, [sp], #16
    ldp x21, x22, [sp], #16
    ldp x23, x24, [sp], #16

--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S
@@ -19,7 +19,9 @@ ConvDwInt8Center:
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
    // whereas our coding style do not permit such amount of parameters
-    sub sp, sp, #48
+    sub sp, sp, #176
+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
    stp x19, x20, [sp], #16
    stp x21, x22, [sp], #16
    stp x23, x24, [sp], #16
@@ -631,7 +633,9 @@ ConvDwInt8Center:
        subs x4, x4, #1
        bne LoopH

-    sub sp, sp, #48
+    sub sp, sp, #176
+    ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
+    ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
    ldp x19, x20, [sp], #16
    ldp x21, x22, [sp], #16
    ldp x23, x24, [sp], #16

--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvFp32Center.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvFp32Center.S
+#ifdef __aarch64__
+
+.text
+.align 5
+.global ConvSwFp32Center
+#ifndef __APPLE__
+.type ConvSwFp32Center, %function
+#endif
+
+// void ConvSwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
+//                      size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t ic4, size_t in_sh_step,
+//                      size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
+//
+// fp32 sliding-window convolution kernel. For every output row (height) it walks the output columns (width) in
+// tiles of 16, then 8, then single pixels. Each output pixel is one 4-float vector that starts from the 4 bias
+// floats and accumulates, over kernel_h x kernel_w x ic4, a 4x4 weight block times 4 input floats.
+// All *_step arguments and block_channel are added to pointers as-is, i.e. they are byte offsets.
+// height, width, kernel_h and kernel_w must be non-zero: their loops decrement first and test afterwards.
+//
+// Register arguments:
+//   x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w
+// Stack arguments (loaded in the prologue):
+//   x8: out_h_step, x9: block_channel, x10: ic4, x11: in_sh_step, x12: in_sw_step, x13: in_kh_step, x14: in_kw_step
+// Stack arguments (reloaded after every tile because x16/x26 are reused as scratch inside the loops):
+//   x26: relu, x16: relu6
+// Derived values:
+//   x15: kernel_h * kernel_w * ic4 * 16, byte distance between the 4 weight rows loaded per step
+//   v25: bias, v26: 6.0f in every lane, v27: 0.0f in every lane
+//   x28: remaining width of the current row (x18 is the platform register and must not be used)
+ConvSwFp32Center:
+    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
+    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
+    // x19 ~ x28 are callee-saved as well.
+    // sp stays lowered for the whole function so the save area lies inside the allocated stack;
+    // memory below sp may be overwritten asynchronously (e.g. by a signal handler).
+    sub sp, sp, #208
+    mov x15, sp
+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64
+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x15], #64
+    stp x19, x20, [x15], #16
+    stp x21, x22, [x15], #16
+    stp x23, x24, [x15], #16
+    stp x25, x26, [x15], #16
+    stp x27, x28, [x15], #16
+
+    // x15 now equals sp + 208, the address of the first stack-passed argument
+    ldr x8, [x15]
+    ldr x9, [x15, #8]
+    ldr x10, [x15, #16]
+    ldr x11, [x15, #24]
+    ldr x12, [x15, #32]
+    ldr x13, [x15, #40]
+    ldr x14, [x15, #48]
+    // x15 = kernel_h * kernel_w * ic4 * 16
+    mul x15, x6, x7
+    mul x15, x10, x15
+    mov x16, #16
+    mul x15, x15, x16
+
+    // v25 = bias, v26 = 6.0f (integer 6 converted to float), v27 = 0.0f
+    ld1 {v25.4s}, [x3]
+    movi v26.4s, #6
+    scvtf v26.4s, v26.4s
+    dup v27.4s, wzr
+
+    LoopH:
+        // x17: src cursor for this row, x28: remaining width, x3: dst cursor (bias already lives in v25)
+        mov x17, x1
+        mov x28, x5
+        mov x3, x0
+        cmp x28, #8
+        blt LoopW
+        cmp x28, #16
+        blt LoopW8
+
+        LoopW16:
+            // x19 = 16 * in_sw_step: src advance for a 16-pixel tile
+            mov x19, #16
+            mul x19, x19, x12
+            mov x20, x17
+            mov x21, x2
+            mov x22, x6
+            // start all 16 accumulators from the bias
+            mov v0.16b, v25.16b
+            mov v1.16b, v25.16b
+            mov v2.16b, v25.16b
+            mov v3.16b, v25.16b
+            mov v4.16b, v25.16b
+            mov v5.16b, v25.16b
+            mov v6.16b, v25.16b
+            mov v7.16b, v25.16b
+            mov v8.16b, v25.16b
+            mov v9.16b, v25.16b
+            mov v10.16b, v25.16b
+            mov v11.16b, v25.16b
+            mov v12.16b, v25.16b
+            mov v13.16b, v25.16b
+            mov v14.16b, v25.16b
+            mov v15.16b, v25.16b
+            LoopKh16:
+                mov x23, x7
+                mov x24, x20
+                LoopKw16:
+                    mov x25, x24
+                    mov x27, x10
+                    LoopIc16:
+                        mov x26, x25
+                        mov x16, x21
+                        // load 4 weight rows x15 bytes apart and transpose the 4x4 block into v28 ~ v31
+                        ld1 {v28.4s}, [x16], x15
+                        ld1 {v29.4s}, [x16], x15
+                        ld1 {v30.4s}, [x16], x15
+                        ld1 {v31.4s}, [x16], x15
+                        zip1 v20.4s, v28.4s, v29.4s
+                        zip2 v21.4s, v28.4s, v29.4s
+                        zip1 v22.4s, v30.4s, v31.4s
+                        zip2 v23.4s, v30.4s, v31.4s
+                        ld1 {v16.4s}, [x26], x12
+                        ld1 {v17.4s}, [x26], x12
+                        trn1 v28.2d, v20.2d, v22.2d
+                        trn2 v29.2d, v20.2d, v22.2d
+                        trn1 v30.2d, v21.2d, v23.2d
+                        trn2 v31.2d, v21.2d, v23.2d
+                        ld1 {v18.4s}, [x26], x12
+                        ld1 {v19.4s}, [x26], x12
+                        fmla v0.4s, v28.4s, v16.s[0]
+                        fmla v1.4s, v28.4s, v17.s[0]
+                        fmla v0.4s, v29.4s, v16.s[1]
+                        fmla v1.4s, v29.4s, v17.s[1]
+                        fmla v0.4s, v30.4s, v16.s[2]
+                        fmla v1.4s, v30.4s, v17.s[2]
+                        fmla v0.4s, v31.4s, v16.s[3]
+                        fmla v1.4s, v31.4s, v17.s[3]
+                        ld1 {v20.4s}, [x26], x12
+                        ld1 {v21.4s}, [x26], x12
+                        fmla v2.4s, v28.4s, v18.s[0]
+                        fmla v3.4s, v28.4s, v19.s[0]
+                        fmla v2.4s, v29.4s, v18.s[1]
+                        fmla v3.4s, v29.4s, v19.s[1]
+                        fmla v2.4s, v30.4s, v18.s[2]
+                        fmla v3.4s, v30.4s, v19.s[2]
+                        fmla v2.4s, v31.4s, v18.s[3]
+                        fmla v3.4s, v31.4s, v19.s[3]
+                        ld1 {v22.4s}, [x26], x12
+                        ld1 {v23.4s}, [x26], x12
+                        fmla v4.4s, v28.4s, v20.s[0]
+                        fmla v5.4s, v28.4s, v21.s[0]
+                        fmla v4.4s, v29.4s, v20.s[1]
+                        fmla v5.4s, v29.4s, v21.s[1]
+                        fmla v4.4s, v30.4s, v20.s[2]
+                        fmla v5.4s, v30.4s, v21.s[2]
+                        fmla v4.4s, v31.4s, v20.s[3]
+                        fmla v5.4s, v31.4s, v21.s[3]
+                        ld1 {v16.4s}, [x26], x12
+                        ld1 {v17.4s}, [x26], x12
+                        fmla v6.4s, v28.4s, v22.s[0]
+                        fmla v7.4s, v28.4s, v23.s[0]
+                        fmla v6.4s, v29.4s, v22.s[1]
+                        fmla v7.4s, v29.4s, v23.s[1]
+                        fmla v6.4s, v30.4s, v22.s[2]
+                        fmla v7.4s, v30.4s, v23.s[2]
+                        fmla v6.4s, v31.4s, v22.s[3]
+                        fmla v7.4s, v31.4s, v23.s[3]
+                        ld1 {v18.4s}, [x26], x12
+                        ld1 {v19.4s}, [x26], x12
+                        fmla v8.4s, v28.4s, v16.s[0]
+                        fmla v9.4s, v28.4s, v17.s[0]
+                        fmla v8.4s, v29.4s, v16.s[1]
+                        fmla v9.4s, v29.4s, v17.s[1]
+                        fmla v8.4s, v30.4s, v16.s[2]
+                        fmla v9.4s, v30.4s, v17.s[2]
+                        fmla v8.4s, v31.4s, v16.s[3]
+                        fmla v9.4s, v31.4s, v17.s[3]
+                        ld1 {v20.4s}, [x26], x12
+                        ld1 {v21.4s}, [x26], x12
+                        fmla v10.4s, v28.4s, v18.s[0]
+                        fmla v11.4s, v28.4s, v19.s[0]
+                        fmla v10.4s, v29.4s, v18.s[1]
+                        fmla v11.4s, v29.4s, v19.s[1]
+                        fmla v10.4s, v30.4s, v18.s[2]
+                        fmla v11.4s, v30.4s, v19.s[2]
+                        fmla v10.4s, v31.4s, v18.s[3]
+                        fmla v11.4s, v31.4s, v19.s[3]
+                        ld1 {v22.4s}, [x26], x12
+                        ld1 {v23.4s}, [x26], x12
+                        fmla v12.4s, v28.4s, v20.s[0]
+                        fmla v13.4s, v28.4s, v21.s[0]
+                        fmla v12.4s, v29.4s, v20.s[1]
+                        fmla v13.4s, v29.4s, v21.s[1]
+                        fmla v12.4s, v30.4s, v20.s[2]
+                        fmla v13.4s, v30.4s, v21.s[2]
+                        fmla v12.4s, v31.4s, v20.s[3]
+                        fmla v13.4s, v31.4s, v21.s[3]
+                        fmla v14.4s, v28.4s, v22.s[0]
+                        fmla v15.4s, v28.4s, v23.s[0]
+                        fmla v14.4s, v29.4s, v22.s[1]
+                        fmla v15.4s, v29.4s, v23.s[1]
+                        fmla v14.4s, v30.4s, v22.s[2]
+                        fmla v15.4s, v30.4s, v23.s[2]
+                        fmla v14.4s, v31.4s, v22.s[3]
+                        fmla v15.4s, v31.4s, v23.s[3]
+                        // next 4 input channels: 16 bytes further in both weight and src
+                        add x21, x21, #16
+                        add x25, x25, #16
+                        subs x27, x27, #1
+                        bgt LoopIc16
+                    subs x23, x23, #1
+                    add x24, x24, x14
+                    bne LoopKw16
+                add x20, x20, x13
+                subs x22, x22, #1
+                bne LoopKh16
+            // stack arguments sit above the 208-byte save area: relu6 at +64, relu at +56
+            ldr x16, [sp, #272]
+            cbnz x16, Relu616
+            ldr x26, [sp, #264]
+            cbnz x26, Relu16
+            b Write16
+        Relu616:
+            // clamp to at most 6.0, then fall through to the lower clamp at 0.0
+            fmin v0.4s, v0.4s, v26.4s
+            fmin v1.4s, v1.4s, v26.4s
+            fmin v2.4s, v2.4s, v26.4s
+            fmin v3.4s, v3.4s, v26.4s
+            fmin v4.4s, v4.4s, v26.4s
+            fmin v5.4s, v5.4s, v26.4s
+            fmin v6.4s, v6.4s, v26.4s
+            fmin v7.4s, v7.4s, v26.4s
+            fmin v8.4s, v8.4s, v26.4s
+            fmin v9.4s, v9.4s, v26.4s
+            fmin v10.4s, v10.4s, v26.4s
+            fmin v11.4s, v11.4s, v26.4s
+            fmin v12.4s, v12.4s, v26.4s
+            fmin v13.4s, v13.4s, v26.4s
+            fmin v14.4s, v14.4s, v26.4s
+            fmin v15.4s, v15.4s, v26.4s
+        Relu16:
+            fmax v0.4s, v0.4s, v27.4s
+            fmax v1.4s, v1.4s, v27.4s
+            fmax v2.4s, v2.4s, v27.4s
+            fmax v3.4s, v3.4s, v27.4s
+            fmax v4.4s, v4.4s, v27.4s
+            fmax v5.4s, v5.4s, v27.4s
+            fmax v6.4s, v6.4s, v27.4s
+            fmax v7.4s, v7.4s, v27.4s
+            fmax v8.4s, v8.4s, v27.4s
+            fmax v9.4s, v9.4s, v27.4s
+            fmax v10.4s, v10.4s, v27.4s
+            fmax v11.4s, v11.4s, v27.4s
+            fmax v12.4s, v12.4s, v27.4s
+            fmax v13.4s, v13.4s, v27.4s
+            fmax v14.4s, v14.4s, v27.4s
+            fmax v15.4s, v15.4s, v27.4s
+        Write16:
+            // one 4-float vector per output pixel, block_channel bytes apart
+            st1 {v0.4s}, [x3], x9
+            st1 {v1.4s}, [x3], x9
+            st1 {v2.4s}, [x3], x9
+            st1 {v3.4s}, [x3], x9
+            st1 {v4.4s}, [x3], x9
+            st1 {v5.4s}, [x3], x9
+            st1 {v6.4s}, [x3], x9
+            st1 {v7.4s}, [x3], x9
+            st1 {v8.4s}, [x3], x9
+            st1 {v9.4s}, [x3], x9
+            st1 {v10.4s}, [x3], x9
+            st1 {v11.4s}, [x3], x9
+            st1 {v12.4s}, [x3], x9
+            st1 {v13.4s}, [x3], x9
+            st1 {v14.4s}, [x3], x9
+            st1 {v15.4s}, [x3], x9
+            add x17, x17, x19
+            sub x28, x28, #16
+            cmp x28, #0
+            ble LoopWEnd
+            cmp x28, #8
+            blt LoopW
+            cmp x28, #16
+            bge LoopW16
+        LoopW8:
+            // x19 = 8 * in_sw_step: src advance for an 8-pixel tile
+            mov x19, #8
+            mul x19, x19, x12
+            mov x20, x17
+            mov x21, x2
+            mov x22, x6
+            mov v0.16b, v25.16b
+            mov v1.16b, v25.16b
+            mov v2.16b, v25.16b
+            mov v3.16b, v25.16b
+            mov v4.16b, v25.16b
+            mov v5.16b, v25.16b
+            mov v6.16b, v25.16b
+            mov v7.16b, v25.16b
+            LoopKh8:
+                mov x23, x7
+                mov x24, x20
+                LoopKw8:
+                    mov x25, x24
+                    mov x27, x10
+                    LoopIc8:
+                        mov x26, x25
+                        mov x16, x21
+                        // same weight load + 4x4 transpose as in the 16-pixel path
+                        ld1 {v28.4s}, [x16], x15
+                        ld1 {v29.4s}, [x16], x15
+                        ld1 {v30.4s}, [x16], x15
+                        ld1 {v31.4s}, [x16], x15
+                        zip1 v20.4s, v28.4s, v29.4s
+                        zip2 v21.4s, v28.4s, v29.4s
+                        zip1 v22.4s, v30.4s, v31.4s
+                        zip2 v23.4s, v30.4s, v31.4s
+                        ld1 {v16.4s}, [x26], x12
+                        ld1 {v17.4s}, [x26], x12
+                        trn1 v28.2d, v20.2d, v22.2d
+                        trn2 v29.2d, v20.2d, v22.2d
+                        trn1 v30.2d, v21.2d, v23.2d
+                        trn2 v31.2d, v21.2d, v23.2d
+                        ld1 {v18.4s}, [x26], x12
+                        ld1 {v19.4s}, [x26], x12
+                        fmla v0.4s, v28.4s, v16.s[0]
+                        fmla v1.4s, v28.4s, v17.s[0]
+                        fmla v0.4s, v29.4s, v16.s[1]
+                        fmla v1.4s, v29.4s, v17.s[1]
+                        fmla v0.4s, v30.4s, v16.s[2]
+                        fmla v1.4s, v30.4s, v17.s[2]
+                        fmla v0.4s, v31.4s, v16.s[3]
+                        fmla v1.4s, v31.4s, v17.s[3]
+                        ld1 {v20.4s}, [x26], x12
+                        ld1 {v21.4s}, [x26], x12
+                        fmla v2.4s, v28.4s, v18.s[0]
+                        fmla v3.4s, v28.4s, v19.s[0]
+                        fmla v2.4s, v29.4s, v18.s[1]
+                        fmla v3.4s, v29.4s, v19.s[1]
+                        fmla v2.4s, v30.4s, v18.s[2]
+                        fmla v3.4s, v30.4s, v19.s[2]
+                        fmla v2.4s, v31.4s, v18.s[3]
+                        fmla v3.4s, v31.4s, v19.s[3]
+                        ld1 {v22.4s}, [x26], x12
+                        ld1 {v23.4s}, [x26], x12
+                        fmla v4.4s, v28.4s, v20.s[0]
+                        fmla v5.4s, v28.4s, v21.s[0]
+                        fmla v4.4s, v29.4s, v20.s[1]
+                        fmla v5.4s, v29.4s, v21.s[1]
+                        fmla v4.4s, v30.4s, v20.s[2]
+                        fmla v5.4s, v30.4s, v21.s[2]
+                        fmla v4.4s, v31.4s, v20.s[3]
+                        fmla v5.4s, v31.4s, v21.s[3]
+                        fmla v6.4s, v28.4s, v22.s[0]
+                        fmla v7.4s, v28.4s, v23.s[0]
+                        fmla v6.4s, v29.4s, v22.s[1]
+                        fmla v7.4s, v29.4s, v23.s[1]
+                        fmla v6.4s, v30.4s, v22.s[2]
+                        fmla v7.4s, v30.4s, v23.s[2]
+                        fmla v6.4s, v31.4s, v22.s[3]
+                        fmla v7.4s, v31.4s, v23.s[3]
+                        add x21, x21, #16
+                        add x25, x25, #16
+                        subs x27, x27, #1
+                        bgt LoopIc8
+                    subs x23, x23, #1
+                    add x24, x24, x14
+                    bne LoopKw8
+                add x20, x20, x13
+                subs x22, x22, #1
+                bne LoopKh8
+            // stack arguments sit above the 208-byte save area: relu6 at +64, relu at +56
+            ldr x16, [sp, #272]
+            cbnz x16, Relu68
+            ldr x26, [sp, #264]
+            cbnz x26, Relu8
+            b Write8
+        Relu68:
+            fmin v0.4s, v0.4s, v26.4s
+            fmin v1.4s, v1.4s, v26.4s
+            fmin v2.4s, v2.4s, v26.4s
+            fmin v3.4s, v3.4s, v26.4s
+            fmin v4.4s, v4.4s, v26.4s
+            fmin v5.4s, v5.4s, v26.4s
+            fmin v6.4s, v6.4s, v26.4s
+            fmin v7.4s, v7.4s, v26.4s
+        Relu8:
+            fmax v0.4s, v0.4s, v27.4s
+            fmax v1.4s, v1.4s, v27.4s
+            fmax v2.4s, v2.4s, v27.4s
+            fmax v3.4s, v3.4s, v27.4s
+            fmax v4.4s, v4.4s, v27.4s
+            fmax v5.4s, v5.4s, v27.4s
+            fmax v6.4s, v6.4s, v27.4s
+            fmax v7.4s, v7.4s, v27.4s
+        Write8:
+            st1 {v0.4s}, [x3], x9
+            st1 {v1.4s}, [x3], x9
+            st1 {v2.4s}, [x3], x9
+            st1 {v3.4s}, [x3], x9
+            st1 {v4.4s}, [x3], x9
+            st1 {v5.4s}, [x3], x9
+            st1 {v6.4s}, [x3], x9
+            st1 {v7.4s}, [x3], x9
+            add x17, x17, x19
+            sub x28, x28, #8
+            cmp x28, #0
+            ble LoopWEnd
+            cmp x28, #8
+            bge LoopW8
+        LoopW:
+            // single-pixel tail: one accumulator, src advances by in_sw_step per pixel
+            mov x20, x17
+            mov x21, x2
+            mov x22, x6
+            mov v0.16b, v25.16b
+            LoopKh:
+                mov x23, x7
+                mov x24, x20
+                LoopKw:
+                    mov x25, x24
+                    mov x27, x10
+                    LoopIc:
+                        mov x26, x25
+                        mov x16, x21
+                        ld1 {v28.4s}, [x16], x15
+                        ld1 {v29.4s}, [x16], x15
+                        ld1 {v30.4s}, [x16], x15
+                        ld1 {v31.4s}, [x16], x15
+                        zip1 v20.4s, v28.4s, v29.4s
+                        zip2 v21.4s, v28.4s, v29.4s
+                        zip1 v22.4s, v30.4s, v31.4s
+                        zip2 v23.4s, v30.4s, v31.4s
+                        ld1 {v16.4s}, [x26], x12
+                        trn1 v28.2d, v20.2d, v22.2d
+                        trn2 v29.2d, v20.2d, v22.2d
+                        trn1 v30.2d, v21.2d, v23.2d
+                        trn2 v31.2d, v21.2d, v23.2d
+                        fmla v0.4s, v28.4s, v16.s[0]
+                        fmla v0.4s, v29.4s, v16.s[1]
+                        fmla v0.4s, v30.4s, v16.s[2]
+                        fmla v0.4s, v31.4s, v16.s[3]
+                        add x21, x21, #16
+                        add x25, x25, #16
+                        subs x27, x27, #1
+                        bgt LoopIc
+                    subs x23, x23, #1
+                    add x24, x24, x14
+                    bne LoopKw
+                add x20, x20, x13
+                subs x22, x22, #1
+                bne LoopKh
+            // stack arguments sit above the 208-byte save area: relu6 at +64, relu at +56
+            ldr x16, [sp, #272]
+            cbnz x16, Relu6
+            ldr x26, [sp, #264]
+            cbnz x26, Relu
+            b Write
+        Relu6:
+            fmin v0.4s, v0.4s, v26.4s
+        Relu:
+            fmax v0.4s, v0.4s, v27.4s
+        Write:
+            st1 {v0.4s}, [x3], x9
+            add x17, x17, x12
+            subs x28, x28, #1
+            bne LoopW
+    LoopWEnd:
+        // next output row: dst += out_h_step, src += in_sh_step
+        add x0, x0, x8
+        add x1, x1, x11
+        subs x4, x4, #1
+        bne LoopH
+
+    // sp still points at the base of the save area; the post-increments pop it back to the entry value
+    ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
+    ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
+    ldp x19, x20, [sp], #16
+    ldp x21, x22, [sp], #16
+    ldp x23, x24, [sp], #16
+    ldp x25, x26, [sp], #16
+    ldp x27, x28, [sp], #16
+    ret
+#endif
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h
@@ -71,6 +71,11 @@ void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_

 void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod,
                        size_t plane_size, size_t stride, size_t relu_type);
+
+void ConvSwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height,
+                      size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
+                      size_t ic4, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
+                      size_t relu, size_t relu6);
 #endif

 #ifdef __cplusplus

--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c
@@ -16,6 +16,7 @@

 #include "nnacl/fp32/conv.h"
 #include <string.h>
+#include "nnacl/fp32/common_func.h"
 #include "nnacl/winograd_transform.h"

 void SWBorderPixel(float *dst, const float *src, const float *weight, const float *bias, int height, int width,
@@ -83,6 +84,7 @@ void SWBorder(float *dst, const float *src, const float *weight, const float *bi
  }  // height loop
 }

+#ifndef ENABLE_ARM64
 void SWCenter(float *dst, const float *src, const float *weight, const float *bias, int height, int width, int kernel_h,
              int kernel_w, int out_h_step, int block_channel, int ic4, int in_sh_step, int in_sw_step, int in_kh_step,
              int in_kw_step, bool is_relu, bool is_relu6) {
@@ -135,6 +137,7 @@ void SWCenter(float *dst, const float *src, const float *weight, const float *bi
    src_h += in_sh_step;
  }  // dst_height loop
 }
+#endif

 // fp32 sliding window
 void ConvSWFp32(const float *input_data, const float *packed_weight, const float *bias_data, float *tmp_out_block,
@@ -172,11 +175,23 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
          src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_;
        float *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ +
                       slidingWindow_param->left_ * slidingWindow_param->block_channel_;
+#ifdef ENABLE_ARM64
+        ConvSwFp32Center(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
+                         slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_,
+                         conv_param->kernel_w_, slidingWindow_param->out_h_step_ * sizeof(float),
+                         slidingWindow_param->block_channel_ * sizeof(float), ic4,
+                         slidingWindow_param->in_sh_step_ * sizeof(float),
+                         slidingWindow_param->in_sw_step_ * sizeof(float),
+                         slidingWindow_param->in_kh_step_ * sizeof(float),
+                         slidingWindow_param->in_kw_step_ * sizeof(float),
+                         conv_param->is_relu_, conv_param->is_relu6_);
+#else
        SWCenter(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
-                 slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, conv_param->kernel_w_,
-                 slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4,
+                 slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_,
+                 conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4,
                 slidingWindow_param->in_sh_step_, slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_,
                 slidingWindow_param->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
+#endif
      }
    }  // output C4 loop
    src += slidingWindow_param->in_step_;