From e52aa2b12f244b73ff3108fb92e14072b5d3bdf5 Mon Sep 17 00:00:00 2001 From: lixian <179220644@qq.com> Date: Fri, 31 Jul 2020 21:07:33 +0800 Subject: [PATCH] fix compile bugs on arm32 and add BiasAdd/Relu/Dw arm64 kernels --- .../assembly/arm32/IndirectGemmFp32_8x4.S | 180 +++++++++--------- .../assembly/arm32/IndirectGemmInt8_2x4.S | 65 ++++--- .../arm/opclib/assembly/arm64/C4BiasAdd.S | 131 +++++++++++++ .../arm/opclib/assembly/arm64/C4BiasAddRelu.S | 137 +++++++++++++ .../opclib/assembly/arm64/C4BiasAddRelu6.S | 146 ++++++++++++++ .../kernel/arm/opclib/assembly/arm64/C4Relu.S | 132 +++++++++++++ .../arm/opclib/assembly/arm64/C4Relu6.S | 140 ++++++++++++++ .../opclib/assembly/arm64/ConvDwFp32Center.S | 96 ++++++++++ .../assembly/arm64/DeconvDwFp32Center.S | 77 ++++++++ .../kernel/arm/opclib/fp32/common_func.cc | 11 +- .../kernel/arm/opclib/fp32/common_func.h | 11 ++ .../kernel/arm/opclib/fp32/conv_depthwise.cc | 11 ++ 12 files changed, 1019 insertions(+), 118 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAdd.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu6.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu6.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwFp32Center.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwFp32Center.S diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/IndirectGemmFp32_8x4.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/IndirectGemmFp32_8x4.S index 4e4e8e27b..d7f924b3a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/IndirectGemmFp32_8x4.S +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/IndirectGemmFp32_8x4.S @@ -1,4 +1,5 @@ -#ifdef __aarch64__ +#ifdef __arm__ +#ifndef __aarch64__ .text .align 5 @@ -10,15 +11,16 @@ // void IndirectGemmFp32_8x4(float *output, float *input, float *weight, float *bias, // size_t kSize, size_t ic4, size_t oc8, size_t offset, size_t mode, size_t writeC4, size_t relu, size_t relu6); // r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset -// r8:mode, r10: writeMode, x10: relu, r10:relu6 +// r8:mode, r10: writeMode, r10: relu, r10:relu6 // mode = 0 for general convolution, where one conv unit is a row // mode = 1 for winograd/common gemm, where the total channels of one input is a row IndirectGemmFp32_8x4: .macro INIT_BIAS veor q10, q10, q10 - cbz x3, InitBias - vld1.32 q10, [x3] + cmp r3, #0 + beq InitBias + vld1.32 q10, [r3] InitBias: vmov q11, q10 vmov q12, q10 @@ -27,10 +29,11 @@ IndirectGemmFp32_8x4: vmov q15, q10 .endm - // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to - // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers - // r19 ~ r29 should be also preserved - // whereas our coding style do not permit such amount of parameters + // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" + // according to https://stackoverflow.com/questions/53625807 + // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway + // clang's rule seems more simple, though there are no subroutine calls here + // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf push {r4-r8, r10, r11, lr} vpush {q4-q7} add sp, sp, #160 @@ -41,7 +44,8 @@ IndirectGemmFp32_8x4: ldr r7, [sp, #12] ldr r8, [sp, #16] - cbnz r8, LoopOc + cmp r8, #0 + bne LoopOc // step is one for common convolution, where ic8 should multiply by kernel size // step is (a+b-1) for F(a,b) in winograd mul r5, r4, r5 @@ -57,17 +61,18 @@ IndirectGemmFp32_8x4: INIT_BIAS // load input for output 1-2 - vld1.32 {q0, q1, q2, q3}, [x12]! + vld1.32 {q0, q1}, [r12]! + vld1.32 {q2, q3}, [r12]! // load weight - vld1.32 {q4, q5}, [x2]! + vld1.32 {q4, q5}, [r2]! // step for output 1-2 vmul.f32 q8, q4, d0[0] vmul.f32 q9, q4, d2[0] vmla.f32 q8, q5, d0[1] vmla.f32 q9, q5, d2[1] - vld1.32 {q6, q7}, [x2]! + vld1.32 {q6, q7}, [r2]! - subs x10, x5, #1 + subs r10, r5, #1 beq LoopIcEnd LoopIc: @@ -146,9 +151,11 @@ IndirectGemmFp32_8x4: vmla.f32 q15, q7, d7[1] ldr r10, [sp, #28] - cbnz r10, Relu6 + cmp r10, #0 + bne Relu6 ldr r10, [sp, #24] - cbnz x10, Relu + cmp r10, #0 + bne Relu b WriteStart Relu6: vmov.i32 q14, #6 @@ -174,7 +181,8 @@ IndirectGemmFp32_8x4: WriteStart: ldr r10, [sp, #20] - cbnz x10, WriteC4 + cmp r10, #0 + bne WriteC4 cmp r6, #1 beq Write1 cmp r6, #2 @@ -183,97 +191,97 @@ IndirectGemmFp32_8x4: beq Write3 b Write4 Write1: - str s0, [r11] - add r11, r11, x7 - str s4, [r11] - add r11, r11, x7 - str s8, [r11] - add r11, r11, x7 - str s12, [r11] - add r11, r11, x7 - str s16, [r11] - add r11, r11, x7 - str s20, [r11] - add r11, r11, x7 - str s24, [r11] - add r11, r11, x7 - str s28, [r11] + vst1.32 d0[0], [r11] + add r11, r11, r7 + vst1.32 d2[0], [r11] + add r11, r11, r7 + vst1.32 d4[0], [r11] + add r11, r11, r7 + vst1.32 d6[0], [r11] + add r11, r11, r7 + vst1.32 d8[0], [r11] + add r11, r11, r7 + vst1.32 d10[0], [r11] + add r11, r11, r7 + vst1.32 d12[0], [r11] + add r11, r11, r7 + vst1.32 d14[0], [r11] add r0, r0, #4 b WriteEnd Write2: - str d0, [r11] - add r11, r11, x7 - str d2, [r11] - add r11, r11, x7 - str d4, [r11] - add r11, r11, x7 - str d6, [r11] - add r11, r11, x7 - str d8, [r11] - add r11, r11, x7 - str d10, [r11] - add r11, r11, x7 - str d12, [r11] - add r11, r11, x7 - str d14, [r11] + vst1.32 d0, [r11] + add r11, r11, r7 + vst1.32 d2, [r11] + add r11, r11, r7 + vst1.32 d4, [r11] + add r11, r11, r7 + vst1.32 d6, [r11] + add r11, r11, r7 + vst1.32 d8, [r11] + add r11, r11, r7 + vst1.32 d10, [r11] + add r11, r11, r7 + vst1.32 d12, [r11] + add r11, r11, r7 + vst1.32 d14, [r11] add r0, r0, #8 b WriteEnd Write3: add r12, r11, #8 - str d0, [r11] - add r11, r11, x7 - str s2, [r12] + vst1.32 d0, [r11] + add r11, r11, r7 + vst1.32 d1[0], [r12] add r12, r12, r7 - str d2, [r11] - add r11, r11, x7 - str s6, [r12] + vst1.32 d2, [r11] + add r11, r11, r7 + vst1.32 d3[0], [r12] add r12, r12, r7 - str d4, [r11] - add r11, r11, x7 - str s10, [r12] + vst1.32 d4, [r11] + add r11, r11, r7 + vst1.32 d5[0], [r12] add r12, r12, r7 - str d6, [r11] - add r11, r11, x7 - str s14, [r12] + vst1.32 d6, [r11] + add r11, r11, r7 + vst1.32 d7[0], [r12] add r12, r12, r7 - str d8, [r11] - add r11, r11, x7 - str s18, [r12] + vst1.32 d8, [r11] + add r11, r11, r7 + vst1.32 d9[0], [r12] add r12, r12, r7 - str d10, [r11] - add r11, r11, x7 - str s22, [r12] + vst1.32 d10, [r11] + add r11, r11, r7 + vst1.32 d11[0], [r12] add r12, r12, r7 - str d12, [r11] - add r11, r11, x7 - str s26, [r12] + vst1.32 d12, [r11] + add r11, r11, r7 + vst1.32 d13[0], [r12] add r12, r12, r7 - str d14, [r11] - str s30, [r12] + vst1.32 d14, [r11] + vst1.32 d15[0], [r12] add r0, r0, #12 b WriteEnd WriteC4: - vst1.32 q0, [r11], x7 - vst1.32 q1, [r11], x7 - vst1.32 q2, [r11], x7 - vst1.32 q3, [r11], x7 - vst1.32 q4, [r11], x7 - vst1.32 q5, [r11], x7 - vst1.32 q6, [r11], x7 + vst1.32 q0, [r11], r7 + vst1.32 q1, [r11], r7 + vst1.32 q2, [r11], r7 + vst1.32 q3, [r11], r7 + vst1.32 q4, [r11], r7 + vst1.32 q5, [r11], r7 + vst1.32 q6, [r11], r7 vst1.32 q7, [r11] add r0, r0, #16 b WriteEnd Write4: // prefetching is not prefered while writing results in spite of cache missings - // you could try prfm pstl2strm + // you could try prfm pstl2vst1.32m // there are almost no benefits observed though - vst1.32 q0, [r11], x7 - vst1.32 q1, [r11], x7 - vst1.32 q2, [r11], x7 - vst1.32 q3, [r11], x7 - vst1.32 q4, [r11], x7 - vst1.32 q5, [r11], x7 - vst1.32 q6, [r11], x7 + vst1.32 q0, [r11], r7 + vst1.32 q1, [r11], r7 + vst1.32 q2, [r11], r7 + vst1.32 q3, [r11], r7 + vst1.32 q4, [r11], r7 + vst1.32 q5, [r11], r7 + vst1.32 q6, [r11], r7 vst1.32 q7, [r11] add r0, r0, #16 @@ -283,7 +291,8 @@ IndirectGemmFp32_8x4: bne LoopKsize subs r6, r6, #4 - cbz r3, NoStepFowrard + cmp r3, #0 + beq NoStepFowrard add r3, r3, #16 NoStepFowrard: bgt LoopOc @@ -292,3 +301,4 @@ IndirectGemmFp32_8x4: vpop {q4-q7} pop {r4-r8, r10, r11, pc} #endif +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/IndirectGemmInt8_2x4.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/IndirectGemmInt8_2x4.S index 6a2309449..0e71c5971 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/IndirectGemmInt8_2x4.S +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/IndirectGemmInt8_2x4.S @@ -1,4 +1,5 @@ -#ifdef __aarch64__ +#ifdef __arm__ +#ifndef __aarch64__ .text .align 5 @@ -10,8 +11,8 @@ // void IndirectGemmInt8_2x4(int8_t *output, int8_t *input, int8_t *weight, int32_t *bias, size_t ksize, size_t ic4, // size_t oc, size_t offset, int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp, size_t out_multiplier, // size_t shift_before, size_t shift_after); -// x0: output, x1: input, r2: weight, x3: bias, x4: kSize, x5: ic4, x6: oc, x7: offset -// x8: input_sum, x10: act_min, x11: act_max, x10: out_zp, x11: out_multiplier, x10: shift_before, x11: shift_after +// r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset +// r8: input_sum, r10: act_min, r11: act_max, r10: out_zp, r11: out_multiplier, r10: shift_before, r11: shift_after IndirectGemmInt8_2x4: .macro INIT_BIAS @@ -73,7 +74,7 @@ IndirectGemmInt8_2x4: vmlal.s8 q6, d1, d9 vmlal.s8 q7, d1, d11 - subs x10, x5, #1 + subs r10, r5, #1 beq LoopIcEnd LoopIc: @@ -107,7 +108,7 @@ IndirectGemmInt8_2x4: vmlal.s8 q6, d1, d9 vmlal.s8 q7, d1, d11 - subs x10, x10, #1 + subs r10, r10, #1 bne LoopIc LoopIcEnd: @@ -131,15 +132,23 @@ IndirectGemmInt8_2x4: vld1.32 q0[], [r10]! vld1.32 q1[], [r10]! // pairwise add - vpadd.i32 q8, q8, q9 - vpadd.i32 q10, q10, q11 - vpadd.i32 q12, q12, q13 - vpadd.i32 q14, q14, q15 - vpadd.i32 q8, q8, q10 - vpadd.i32 q12, q12, q14 + vpadd.i32 d16, d16, d17 + vpadd.i32 d18, d18, d19 + vpadd.i32 d20, d20, d21 + vpadd.i32 d22, d22, d23 + vpadd.i32 d24, d24, d25 + vpadd.i32 d26, d26, d27 + vpadd.i32 d28, d28, d29 + vpadd.i32 d30, d30, d31 + + vpadd.i32 d16, d16, d18 + vpadd.i32 d17, d20, d22 + vpadd.i32 d24, d24, d26 + vpadd.i32 d25, d28, d30 vsub.i32 q8, q8, q0 vsub.i32 q12, q12, q1 - cbz r3, NoBias + cmp r3, #0 + beq NoBias vld1.32 q2, [r3] vadd.i32 q8, q8, q2 vadd.i32 q12, q12, q2 @@ -182,34 +191,34 @@ IndirectGemmInt8_2x4: // prefetching is not prefered while writing results in spite of cache missings // you could try prfm pstl2strm WriteStart: - cmp x6, #1 + cmp r6, #1 beq Write1 - cmp x6, #2 + cmp r6, #2 beq Write2 - cmp x6, #3 + cmp r6, #3 beq Write3 b Write4 Write1: - vst1.8 {d0[0]}, [x11], x7 - vst1.8 {d0[1]}, [x11] + vst1.8 {d0[0]}, [r11], r7 + vst1.8 {d0[1]}, [r11] add r0, r0, #1 b WriteEnd Write2: - vst1.16 {d0[0]}, [x11], x7 - vst1.16 {d0[1]}, [x11] + vst1.16 {d0[0]}, [r11], r7 + vst1.16 {d0[1]}, [r11] add r0, r0, #2 b WriteEnd Write3: - add x14, x11, #2 - vst1.16 {d0[0]}, [x11], x7 - vst1.16 {d0[1]}, [x11] - vst1.8 {d0[0]}, [x14], x7 - vst1.8 {d0[1]}, [x14] + add r14, r11, #2 + vst1.16 {d0[0]}, [r11], r7 + vst1.16 {d0[1]}, [r11] + vst1.8 {d0[0]}, [r14], r7 + vst1.8 {d0[1]}, [r14] add r0, r0, #3 b WriteEnd Write4: - vst1.32 {d0[0]}, [x11], x7 - vst1.32 {d0[1]}, [x11] + vst1.32 {d0[0]}, [r11], r7 + vst1.32 {d0[1]}, [r11] add r0, r0, #4 WriteEnd: @@ -218,7 +227,8 @@ IndirectGemmInt8_2x4: bne LoopKsize subs r6, r6, #4 - cbz r3, NoStepFowrard + cmp r3, #0 + beq NoStepFowrard add r3, r3, #16 NoStepFowrard: bgt LoopOc @@ -227,3 +237,4 @@ IndirectGemmInt8_2x4: vpop {q4-q7} pop {r4-r8, r10, r11, pc} #endif +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAdd.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAdd.S new file mode 100644 index 000000000..d59b0239f --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAdd.S @@ -0,0 +1,131 @@ +#ifdef __aarch64__ + + .text + .align 5 + //.p2align 5,,15 + .global C4BiasAdd +#ifndef __APPLE__ + .type C4BiasAdd, %function +#endif + +//void C4BiasAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) +//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride + +C4BiasAdd: + + LoopOc: + ld1 {v4.4s}, [x2], #16 + mov x6, x4 + mov x7, x0 + cmp x6, #4 + blt Loop1 + + Loop4: + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 + fadd v0.4s, v0.4s, v4.4s + fadd v1.4s, v1.4s, v4.4s + fadd v2.4s, v2.4s, v4.4s + fadd v3.4s, v3.4s, v4.4s + + cmp x3, #4 + bge Write4x4 + cmp x3, #3 + beq Write3x4 + cmp x3, #2 + beq Write2x4 + + Write1x4: + str s0, [x7] + add x7, x7, x5 + str s1, [x7] + add x7, x7, x5 + str s2, [x7] + add x7, x7, x5 + str s3, [x7] + add x7, x7, x5 + b WriteEndx4 + Write2x4: + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x5 + dup s17, v1.s[1] + stp s1, s17, [x7] + add x7, x7, x5 + dup s18, v2.s[1] + stp s2, s18, [x7] + add x7, x7, x5 + dup s19, v3.s[1] + stp s3, s19, [x7] + add x7, x7, x5 + b WriteEndx4 + Write3x4: + add x8, x7, #8 + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x5 + st1 {v0.s}[2], [x8], x5 + dup s17, v1.s[1] + stp s1, s17, [x7] + add x7, x7, x5 + st1 {v1.s}[2], [x8], x5 + dup s18, v2.s[1] + stp s2, s18, [x7] + add x7, x7, x5 + st1 {v2.s}[2], [x8], x5 + dup s19, v3.s[1] + stp s3, s19, [x7] + add x7, x7, x5 + st1 {v3.s}[2], [x8], x5 + b WriteEndx4 + Write4x4: + st1 {v0.4s}, [x7], x5 + st1 {v1.4s}, [x7], x5 + st1 {v2.4s}, [x7], x5 + st1 {v3.4s}, [x7], x5 + + WriteEndx4: + subs x6, x6, #4 + beq LoopOcEnd + cmp x6, #4 + blt Loop1 + b Loop4 + + Loop1: + ld1 {v0.4s}, [x1], #16 + fadd v0.4s, v0.4s, v4.4s + + cmp x3, #4 + bge Write4 + cmp x3, #3 + beq Write3 + cmp x3, #2 + beq Write2 + + Write1: + str s0, [x7] + add x7, x7, x5 + b WriteEnd + Write2: + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x5 + b WriteEnd + Write3: + add x8, x7, #8 + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x5 + st1 {v0.s}[2], [x8], x5 + b WriteEnd + Write4: + st1 {v0.4s}, [x7], x5 + WriteEnd: + subs x6, x6, #1 + bne Loop1 + LoopOcEnd: + subs x3, x3, #4 + add x0, x0, #16 + bgt LoopOc + +ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu.S new file mode 100644 index 000000000..6a464b400 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu.S @@ -0,0 +1,137 @@ +#ifdef __aarch64__ + + .text + .align 5 + //.p2align 5,,15 + .global C4BiasAddRelu +#ifndef __APPLE__ + .type C4BiasAddRelu, %function +#endif + +//void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) +//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride + +C4BiasAddRelu: + dup v5.4s, wzr + LoopOc: + ld1 {v4.4s}, [x2], #16 + mov x6, x4 + mov x7, x0 + cmp x6, #4 + blt Loop1 + + Loop4: + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 + fadd v0.4s, v0.4s, v4.4s + fadd v1.4s, v1.4s, v4.4s + fadd v2.4s, v2.4s, v4.4s + fadd v3.4s, v3.4s, v4.4s + + fmax v0.4s, v0.4s, v5.4s + fmax v1.4s, v1.4s, v5.4s + fmax v2.4s, v2.4s, v5.4s + fmax v3.4s, v3.4s, v5.4s + + cmp x3, #4 + bge Write4x4 + cmp x3, #3 + beq Write3x4 + cmp x3, #2 + beq Write2x4 + + Write1x4: + str s0, [x7] + add x7, x7, x5 + str s1, [x7] + add x7, x7, x5 + str s2, [x7] + add x7, x7, x5 + str s3, [x7] + add x7, x7, x5 + b WriteEndx4 + Write2x4: + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x5 + dup s17, v1.s[1] + stp s1, s17, [x7] + add x7, x7, x5 + dup s18, v2.s[1] + stp s2, s18, [x7] + add x7, x7, x5 + dup s19, v3.s[1] + stp s3, s19, [x7] + add x7, x7, x5 + b WriteEndx4 + Write3x4: + add x8, x7, #8 + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x5 + st1 {v0.s}[2], [x8], x5 + dup s17, v1.s[1] + stp s1, s17, [x7] + add x7, x7, x5 + st1 {v1.s}[2], [x8], x5 + dup s18, v2.s[1] + stp s2, s18, [x7] + add x7, x7, x5 + st1 {v2.s}[2], [x8], x5 + dup s19, v3.s[1] + stp s3, s19, [x7] + add x7, x7, x5 + st1 {v3.s}[2], [x8], x5 + b WriteEndx4 + Write4x4: + st1 {v0.4s}, [x7], x5 + st1 {v1.4s}, [x7], x5 + st1 {v2.4s}, [x7], x5 + st1 {v3.4s}, [x7], x5 + + WriteEndx4: + subs x6, x6, #4 + beq LoopOcEnd + cmp x6, #4 + blt Loop1 + b Loop4 + + Loop1: + ld1 {v0.4s}, [x1], #16 + fadd v0.4s, v0.4s, v4.4s + fmax v0.4s, v0.4s, v5.4s + + cmp x3, #4 + bge Write4 + cmp x3, #3 + beq Write3 + cmp x3, #2 + beq Write2 + + Write1: + str s0, [x7] + add x7, x7, x5 + b WriteEnd + Write2: + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x5 + b WriteEnd + Write3: + add x8, x7, #8 + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x5 + st1 {v0.s}[2], [x8], x5 + b WriteEnd + Write4: + st1 {v0.4s}, [x7], x5 + WriteEnd: + subs x6, x6, #1 + bne Loop1 + LoopOcEnd: + subs x3, x3, #4 + add x0, x0, #16 + bgt LoopOc + + ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu6.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu6.S new file mode 100644 index 000000000..b8c8b8484 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu6.S @@ -0,0 +1,146 @@ +#ifdef __aarch64__ + + .text + .align 5 + //.p2align 5,,15 + .global C4BiasAddRelu6 +#ifndef __APPLE__ + .type C4BiasAddRelu6, %function +#endif + +//void C4BiC4BiasAddRelu6asAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) +//x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride + +C4BiasAddRelu6: + dup v5.4s, wzr + movi v6.4s, #6 + scvtf v6.4s, v6.4s + + LoopOc: + ld1 {v4.4s}, [x2], #16 + mov x6, x4 + mov x7, x0 + cmp x6, #4 + blt Loop1 + + Loop4: + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 + fadd v0.4s, v0.4s, v4.4s + fadd v1.4s, v1.4s, v4.4s + fadd v2.4s, v2.4s, v4.4s + fadd v3.4s, v3.4s, v4.4s + + fmax v0.4s, v0.4s, v5.4s + fmax v1.4s, v1.4s, v5.4s + fmax v2.4s, v2.4s, v5.4s + fmax v3.4s, v3.4s, v5.4s + + fmin v0.4s, v0.4s, v6.4s + fmin v1.4s, v1.4s, v6.4s + fmin v2.4s, v2.4s, v6.4s + fmin v3.4s, v3.4s, v6.4s + + cmp x3, #4 + bge Write4x4 + cmp x3, #3 + beq Write3x4 + cmp x3, #2 + beq Write2x4 + + Write1x4: + str s0, [x7] + add x7, x7, x5 + str s1, [x7] + add x7, x7, x5 + str s2, [x7] + add x7, x7, x5 + str s3, [x7] + add x7, x7, x5 + b WriteEndx4 + Write2x4: + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x5 + dup s17, v1.s[1] + stp s1, s17, [x7] + add x7, x7, x5 + dup s18, v2.s[1] + stp s2, s18, [x7] + add x7, x7, x5 + dup s19, v3.s[1] + stp s3, s19, [x7] + add x7, x7, x5 + b WriteEndx4 + Write3x4: + add x8, x7, #8 + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x5 + st1 {v0.s}[2], [x8], x5 + dup s17, v1.s[1] + stp s1, s17, [x7] + add x7, x7, x5 + st1 {v1.s}[2], [x8], x5 + dup s18, v2.s[1] + stp s2, s18, [x7] + add x7, x7, x5 + st1 {v2.s}[2], [x8], x5 + dup s19, v3.s[1] + stp s3, s19, [x7] + add x7, x7, x5 + st1 {v3.s}[2], [x8], x5 + b WriteEndx4 + Write4x4: + st1 {v0.4s}, [x7], x5 + st1 {v1.4s}, [x7], x5 + st1 {v2.4s}, [x7], x5 + st1 {v3.4s}, [x7], x5 + + WriteEndx4: + subs x6, x6, #4 + beq LoopOcEnd + cmp x6, #4 + blt Loop1 + b Loop4 + + Loop1: + ld1 {v0.4s}, [x1], #16 + fadd v0.4s, v0.4s, v4.4s + fmax v0.4s, v0.4s, v5.4s + fmin v0.4s, v0.4s, v6.4s + + cmp x3, #4 + bge Write4 + cmp x3, #3 + beq Write3 + cmp x3, #2 + beq Write2 + + Write1: + str s0, [x7] + add x7, x7, x5 + b WriteEnd + Write2: + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x5 + b WriteEnd + Write3: + add x8, x7, #8 + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x5 + st1 {v0.s}[2], [x8], x5 + b WriteEnd + Write4: + st1 {v0.4s}, [x7], x5 + WriteEnd: + subs x6, x6, #1 + bne Loop1 + LoopOcEnd: + subs x3, x3, #4 + add x0, x0, #16 + bgt LoopOc + + ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu.S new file mode 100644 index 000000000..70ebb2f29 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu.S @@ -0,0 +1,132 @@ +#ifdef __aarch64__ + + .text + .align 5 + //.p2align 5,,15 + .global C4Relu +#ifndef __APPLE__ + .type C4Relu, %function +#endif + +//void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride) +//x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride + +C4Relu: + dup v5.4s, wzr + LoopOc: + mov x6, x3 + mov x7, x0 + cmp x6, #4 + blt Loop1 + + Loop4: + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 + + fmax v0.4s, v0.4s, v5.4s + fmax v1.4s, v1.4s, v5.4s + fmax v2.4s, v2.4s, v5.4s + fmax v3.4s, v3.4s, v5.4s + + cmp x2, #4 + bge Write4x4 + cmp x2, #3 + beq Write3x4 + cmp x2, #2 + beq Write2x4 + + Write1x4: + str s0, [x7] + add x7, x7, x4 + str s1, [x7] + add x7, x7, x4 + str s2, [x7] + add x7, x7, x4 + str s3, [x7] + add x7, x7, x4 + b WriteEndx4 + Write2x4: + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x4 + dup s17, v1.s[1] + stp s1, s17, [x7] + add x7, x7, x4 + dup s18, v2.s[1] + stp s2, s18, [x7] + add x7, x7, x4 + dup s19, v3.s[1] + stp s3, s19, [x7] + add x7, x7, x4 + b WriteEndx4 + Write3x4: + add x8, x7, #8 + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x4 + st1 {v0.s}[2], [x8], x4 + dup s17, v1.s[1] + stp s1, s17, [x7] + add x7, x7, x4 + st1 {v1.s}[2], [x8], x4 + dup s18, v2.s[1] + stp s2, s18, [x7] + add x7, x7, x4 + st1 {v2.s}[2], [x8], x4 + dup s19, v3.s[1] + stp s3, s19, [x7] + add x7, x7, x4 + st1 {v3.s}[2], [x8], x4 + b WriteEndx4 + Write4x4: + st1 {v0.4s}, [x7], x4 + st1 {v1.4s}, [x7], x4 + st1 {v2.4s}, [x7], x4 + st1 {v3.4s}, [x7], x4 + + WriteEndx4: + subs x6, x6, #4 + beq LoopOcEnd + cmp x6, #4 + blt Loop1 + b Loop4 + + Loop1: + ld1 {v0.4s}, [x1], #16 + fadd v0.4s, v0.4s, v4.4s + fmax v0.4s, v0.4s, v5.4s + + cmp x2, #4 + bge Write4 + cmp x2, #3 + beq Write3 + cmp x2, #2 + beq Write2 + + Write1: + str s0, [x7] + add x7, x7, x4 + b WriteEnd + Write2: + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x4 + b WriteEnd + Write3: + add x8, x7, #8 + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x4 + st1 {v0.s}[2], [x8], x4 + b WriteEnd + Write4: + st1 {v0.4s}, [x7], x4 + WriteEnd: + subs x6, x6, #1 + bne Loop1 + LoopOcEnd: + subs x2, x2, #4 + add x0, x0, #16 + bgt LoopOc + + ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu6.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu6.S new file mode 100644 index 000000000..d19c22a6d --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu6.S @@ -0,0 +1,140 @@ +#ifdef __aarch64__ + + .text + .align 5 + //.p2align 5,,15 + .global C4Relu6 +#ifndef __APPLE__ + .type C4Relu6, %function +#endif + +//void C4Relu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) +//x0: dst, x1: input, x2: oc, x2: plane_size, x3: stride + +C4Relu6: + dup v5.4s, wzr + movi v6.4s, #6 + scvtf v6.4s, v6.4s + + LoopOc: + mov x6, x3 + mov x7, x0 + cmp x6, #4 + blt Loop1 + + Loop4: + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 + fmax v0.4s, v0.4s, v5.4s + fmax v1.4s, v1.4s, v5.4s + fmax v2.4s, v2.4s, v5.4s + fmax v3.4s, v3.4s, v5.4s + + fmin v0.4s, v0.4s, v6.4s + fmin v1.4s, v1.4s, v6.4s + fmin v2.4s, v2.4s, v6.4s + fmin v3.4s, v3.4s, v6.4s + + cmp x2, #4 + bge Write4x4 + cmp x2, #3 + beq Write3x4 + cmp x2, #2 + beq Write2x4 + + Write1x4: + str s0, [x7] + add x7, x7, x4 + str s1, [x7] + add x7, x7, x4 + str s2, [x7] + add x7, x7, x4 + str s3, [x7] + add x7, x7, x4 + b WriteEndx4 + Write2x4: + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x4 + dup s17, v1.s[1] + stp s1, s17, [x7] + add x7, x7, x4 + dup s18, v2.s[1] + stp s2, s18, [x7] + add x7, x7, x4 + dup s19, v3.s[1] + stp s3, s19, [x7] + add x7, x7, x4 + b WriteEndx4 + Write3x4: + add x8, x7, #8 + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x4 + st1 {v0.s}[2], [x8], x4 + dup s17, v1.s[1] + stp s1, s17, [x7] + add x7, x7, x4 + st1 {v1.s}[2], [x8], x4 + dup s18, v2.s[1] + stp s2, s18, [x7] + add x7, x7, x4 + st1 {v2.s}[2], [x8], x4 + dup s19, v3.s[1] + stp s3, s19, [x7] + add x7, x7, x4 + st1 {v3.s}[2], [x8], x4 + b WriteEndx4 + Write4x4: + st1 {v0.4s}, [x7], x4 + st1 {v1.4s}, [x7], x4 + st1 {v2.4s}, [x7], x4 + st1 {v3.4s}, [x7], x4 + + WriteEndx4: + subs x6, x6, #4 + beq LoopOcEnd + cmp x6, #4 + blt Loop1 + b Loop4 + + Loop1: + ld1 {v0.4s}, [x1], #16 + fadd v0.4s, v0.4s, v4.4s + fmax v0.4s, v0.4s, v5.4s + fmin v0.4s, v0.4s, v6.4s + + cmp x2, #4 + bge Write4 + cmp x2, #3 + beq Write3 + cmp x2, #2 + beq Write2 + + Write1: + str s0, [x7] + add x7, x7, x4 + b WriteEnd + Write2: + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x4 + b WriteEnd + Write3: + add x8, x7, #8 + dup s16, v0.s[1] + stp s0, s16, [x7] + add x7, x7, x4 + st1 {v0.s}[2], [x8], x4 + b WriteEnd + Write4: + st1 {v0.4s}, [x7], x4 + WriteEnd: + subs x6, x6, #1 + bne Loop1 + LoopOcEnd: + subs x2, x2, #4 + add x0, x0, #16 + bgt LoopOc + + ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwFp32Center.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwFp32Center.S new file mode 100644 index 000000000..0252d28e3 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwFp32Center.S @@ -0,0 +1,96 @@ +#ifdef __aarch64__ + +.text +.align 5 +.global ConvDwFp32Center +#ifndef __APPLE__ +.type ConvDwFp32Center, %function +#endif + +// void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, +// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, +// size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); +// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w, +// x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step +// x14: relu, x15: relu6 +ConvDwFp32Center: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should be also preserved + // whereas our coding style do not permit such amount of parameters + sub sp, sp, #48 + stp x19, x20, [sp], #16 + stp x21, x22, [sp], #16 + stp x23, x24, [sp], #16 + + ldr x8, [sp] + ldr x9, [sp, #8] + ldr x10, [sp, #16] + ldr x11, [sp, #24] + ldr x12, [sp, #32] + ldr x13, [sp, #40] + ldr x14, [sp, #48] + ldr x15, [sp, #56] + + mov x16, #4 + mul x8, x8, x16 + mul x9, x9, x16 + mul x10, x10, x16 + mul x11, x11, x16 + mul x12, x12, x16 + mul x13, x13, x16 + mov x16, #16 + mul x19, x7, x16 + + ld1 {v5.4s}, [x3] + + LoopH: + mov x23, x1 + mov x24, x5 + mov x3, x0 + LoopW: + mov x16, x23 + mov x17, x2 + mov x20, x6 + ld1 {v0.4s}, [x3] + fadd v0.4s, v0.4s, v5.4s + LoopKh: + mov x18, x7 + mov x21, x17 + mov x22, x16 + LoopKw: + ld1 {v1.4s}, [x22], x13 + ld1 {v2.4s}, [x21], #16 + fmla v0.4s, v1.4s, v2.4s + subs x18, x18, #1 + bne LoopKw + add x16, x16, x12 + add x17, x17, x19 + subs x20, x20, #1 + bne LoopKh + cbnz x15, Relu6 + cbnz x14, Relu + b Write + Relu6: + movi v4.4s, #6 + scvtf v4.4s, v4.4s + fmin v0.4s, v0.4s, v4.4s + Relu: + dup v3.4s, wzr + fmax v0.4s, v0.4s, v3.4s + Write: + st1 {v0.4s}, [x3], x9 + add x23, x23, x11 + subs x24, x24, #1 + bne LoopW + add x0, x0, x8 + add x1, x1, x10 + subs x4, x4, #1 + bne LoopH + + sub sp, sp, #48 + ldp x19, x20, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x23, x24, [sp], #16 + ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwFp32Center.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwFp32Center.S new file mode 100644 index 000000000..f23a10029 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwFp32Center.S @@ -0,0 +1,77 @@ +#ifdef __aarch64__ + +.text +.align 5 +.global DeconvDwFp32Center +#ifndef __APPLE__ +.type DeconvDwFp32Center, %function +#endif + +// void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, +// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, +// size_t in_kh_step, size_t in_kw_step); +// x0: dst, x1: src, x2: weight, x3: height, x4: weight, x5: kernel_h, x6: kernel_w, x7: out_h_step +// x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step +DeconvDwFp32Center: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should be also preserved + // whereas our coding style do not permit such amount of parameters + sub sp, sp, #32 + stp x19, x20, [sp], #16 + stp x21, x22, [sp], #16 + + ldr x8, [sp] + ldr x9, [sp, #8] + ldr x10, [sp, #16] + ldr x11, [sp, #24] + ldr x12, [sp, #32] + + mov x13, #4 + mul x7, x7, x13 + mul x8, x8, x13 + mul x9, x9, x13 + mul x10, x10, x13 + mul x11, x11, x13 + mul x12, x12, x13 + mov x13, #16 + mul x14, x6, x13 + + LoopH: + mov x15, x0 + mov x16, x1 + mov x17, x4 + LoopW: + mov x18, x15 + mov x19, x2 + mov x20, x5 + LoopKh: + mov x21, x18 + mov x22, x19 + mov x13, x6 + LoopKw: + ld1 {v0.4s}, [x21] + ld1 {v1.4s}, [x16] + ld1 {v2.4s}, [x22], #16 + fmla v0.4s, v1.4s, v2.4s + st1 {v0.4s}, [x21], x12 + subs x13, x13, #1 + bne LoopKw + add x18, x18, x11 + add x19, x19, x14 + subs x20, x20, #1 + bne LoopKh + add x15, x15, x10 + add x16, x16, x8 + subs x17, x17, #1 + bne LoopW + add x0, x0, x9 + add x1, x1, x7 + subs x3, x3, #1 + bne LoopH + + sub sp, sp, #32 + ldp x19, x20, [sp], #16 + ldp x21, x22, [sp], #16 + ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.cc index ffa93e007..d5cf6c7ea 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.cc +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.cc @@ -81,20 +81,19 @@ void PostConvFuncFp32(const float *c4_out_ptr, float *out_ptr, const float *bias } } #else - int oc4 = UP_DIV(output_channel, C4NUM); if (bias_ptr != nullptr) { if (is_relu) { - BiasAddRelu(bias_ptr, out_ptr, oc4, plane_size); + C4BiasAddRelu(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float)); } else if (is_relu6) { - BiasAddRelu6(bias_ptr, out_ptr, oc4, plane_size); + C4BiasAddRelu6(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float)); } else { - BiasAdd(bias_ptr, out_ptr, oc4, plane_size); + C4BiasAdd(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float)); } } else { if (is_relu) { - Relu(out_ptr, oc4 * plane_size); + C4Relu(out_ptr, c4_out_ptr, output_channel, plane_size, stride * sizeof(float)); } else if (is_relu6) { - Relu6(out_ptr, oc4 * plane_size); + C4Relu6(out_ptr, c4_out_ptr, output_channel, plane_size, stride * sizeof(float)); } else { // do nothing } diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h index 0611df88c..ff906f3f4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h @@ -42,6 +42,17 @@ void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size); void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size); void Relu6(float *data, size_t element4); void Relu(float *data, size_t element4); +void C4BiasAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride); +void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride); +void C4BiasAddRelu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride); +void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); +void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); +void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, + size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, + size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); +void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, + size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, + size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); #endif #ifdef __cplusplus diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc index 3ae161f7b..97cba0972 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc @@ -15,6 +15,7 @@ */ #include "src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h" +#include "src/runtime/kernel/arm/opclib/fp32/common_func.h" #ifdef ENABLE_ARM64 #include #endif @@ -122,6 +123,10 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl void DepthwiseCenter(float *dst, const float *src, const float *weight, const float *bias, int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) { +#ifdef ENABLE_ARM64 + ConvDwFp32Center(dst, src, weight, bias, height, width, kernel_h, kernel_w, out_h_step, block_channel, + in_sh_step, in_sw_step, in_kh_step, in_kw_step, is_relu, is_relu6); +#else float *dst_h = dst; const float *src_h = src; for (int oh = 0; oh < height; oh++) { @@ -163,6 +168,7 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl dst_h += out_h_step; src_h += in_sh_step; } // dst_height loop +#endif } // conv depthwise fp32: sliding window @@ -262,6 +268,10 @@ void DeconvDepthwiseBorder(float *dst, const float *src, const float *weight, in void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step) { +#ifdef ENABLE_ARM64 + DeconvDwFp32Center(dst, src, weight, height, width, kernel_h, kernel_w, out_h_step, block_channel, + in_sh_step, in_sw_step, in_kh_step, in_kw_step); +#else float *dst_h = dst; const float *src_h = src; for (int oh = 0; oh < height; oh++) { @@ -297,6 +307,7 @@ void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, in dst_h += in_sh_step; src_h += out_h_step; } // dst_height loop +#endif } void DeconvDepthwisePostFunc(float *dst, const float *bias, int block_channel, const ConvParameter *conv_param) { -- GitLab