/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifdef __ARM_NEON__ #include namespace paddle_mobile { namespace operators { namespace math { #ifdef __aarch64__ void sgemm_12x8(const float *lhs, const float *rhs, const int k, float *output, const int ldc) { // TODO(hjchen2) } #else void sgemm_6x8(const float *lhs, const float *rhs, const int k, float *output, const int ldc) { int kc1 = k >> 3; // k / 8 int kc2 = k & 0x7; // k % 8 int step = sizeof(float) * ldc; asm volatile( "pld [%[lhs]] \n\t" "pld [%[lhs], #64] \n\t" "pld [%[rhs]] \n\t" "pld [%[rhs], #64] \n\t" "vmov.f32 q4, #0.0 \n\t" "vmov.f32 q5, #0.0 \n\t" "vmov.f32 q6, #0.0 \n\t" "vmov.f32 q7, #0.0 \n\t" "vmov.f32 q8, #0.0 \n\t" "vmov.f32 q9, #0.0 \n\t" "vmov.f32 q10, #0.0 \n\t" "vmov.f32 q11, #0.0 \n\t" "vmov.f32 q12, #0.0 \n\t" "vmov.f32 q13, #0.0 \n\t" "vmov.f32 q14, #0.0 \n\t" "vmov.f32 q15, #0.0 \n\t" "subs %[kc1], %[kc1], #1 \n\t" "blt 2f \n\t" "1: \n\t" "pld [%[lhs], #128] \n\t" "pld [%[rhs], #128] \n\t" "vld1.32 {d0-d2}, [%[lhs]]! \n\t" "vld1.32 {q2, q3}, [%[rhs]]! \n\t" "vmla.f32 q4, q2, d0[0] \n\t" "vmla.f32 q5, q3, d0[0] \n\t" "vmla.f32 q6, q2, d0[1] \n\t" "vmla.f32 q7, q3, d0[1] \n\t" "vmla.f32 q8, q2, d1[0] \n\t" "vmla.f32 q9, q3, d1[0] \n\t" "vmla.f32 q10, q2, d1[1] \n\t" "vmla.f32 q11, q3, d1[1] \n\t" "vmla.f32 q12, q2, d2[0] \n\t" "vmla.f32 q13, q3, d2[0] \n\t" "vmla.f32 q14, q2, d2[1] \n\t" "vmla.f32 q15, q3, d2[1] \n\t" "vld1.32 {d0-d2}, [%[lhs]]! \n\t" "vld1.32 {q2, q3}, [%[rhs]]! \n\t" "vmla.f32 q4, q2, d0[0] \n\t" "vmla.f32 q5, q3, d0[0] \n\t" "vmla.f32 q6, q2, d0[1] \n\t" "vmla.f32 q7, q3, d0[1] \n\t" "vmla.f32 q8, q2, d1[0] \n\t" "vmla.f32 q9, q3, d1[0] \n\t" "vmla.f32 q10, q2, d1[1] \n\t" "vmla.f32 q11, q3, d1[1] \n\t" "vmla.f32 q12, q2, d2[0] \n\t" "vmla.f32 q13, q3, d2[0] \n\t" "vmla.f32 q14, q2, d2[1] \n\t" "vmla.f32 q15, q3, d2[1] \n\t" "pld [%[lhs], #128] \n\t" "pld [%[rhs], #128] \n\t" "vld1.32 {d0-d2}, [%[lhs]]! \n\t" "vld1.32 {q2, q3}, [%[rhs]]! \n\t" "vmla.f32 q4, q2, d0[0] \n\t" "vmla.f32 q5, q3, d0[0] \n\t" "vmla.f32 q6, q2, d0[1] \n\t" "vmla.f32 q7, q3, d0[1] \n\t" "vmla.f32 q8, q2, d1[0] \n\t" "vmla.f32 q9, q3, d1[0] \n\t" "vmla.f32 q10, q2, d1[1] \n\t" "vmla.f32 q11, q3, d1[1] \n\t" "vmla.f32 q12, q2, d2[0] \n\t" "vmla.f32 q13, q3, d2[0] \n\t" "vmla.f32 q14, q2, d2[1] \n\t" "vmla.f32 q15, q3, d2[1] \n\t" "vld1.32 {d0-d2}, [%[lhs]]! \n\t" "vld1.32 {q2, q3}, [%[rhs]]! \n\t" "vmla.f32 q4, q2, d0[0] \n\t" "vmla.f32 q5, q3, d0[0] \n\t" "vmla.f32 q6, q2, d0[1] \n\t" "vmla.f32 q7, q3, d0[1] \n\t" "vmla.f32 q8, q2, d1[0] \n\t" "vmla.f32 q9, q3, d1[0] \n\t" "vmla.f32 q10, q2, d1[1] \n\t" "vmla.f32 q11, q3, d1[1] \n\t" "vmla.f32 q12, q2, d2[0] \n\t" "vmla.f32 q13, q3, d2[0] \n\t" "vmla.f32 q14, q2, d2[1] \n\t" "vmla.f32 q15, q3, d2[1] \n\t" "pld [%[lhs], #128] \n\t" "pld [%[rhs], #128] \n\t" "vld1.32 {d0-d2}, [%[lhs]]! \n\t" "vld1.32 {q2, q3}, [%[rhs]]! \n\t" "vmla.f32 q4, q2, d0[0] \n\t" "vmla.f32 q5, q3, d0[0] \n\t" "vmla.f32 q6, q2, d0[1] \n\t" "vmla.f32 q7, q3, d0[1] \n\t" "vmla.f32 q8, q2, d1[0] \n\t" "vmla.f32 q9, q3, d1[0] \n\t" "vmla.f32 q10, q2, d1[1] \n\t" "vmla.f32 q11, q3, d1[1] \n\t" "vmla.f32 q12, q2, d2[0] \n\t" "vmla.f32 q13, q3, d2[0] \n\t" "vmla.f32 q14, q2, d2[1] \n\t" "vmla.f32 q15, q3, d2[1] \n\t" "vld1.32 {d0-d2}, [%[lhs]]! \n\t" "vld1.32 {q2, q3}, [%[rhs]]! \n\t" "vmla.f32 q4, q2, d0[0] \n\t" "vmla.f32 q5, q3, d0[0] \n\t" "vmla.f32 q6, q2, d0[1] \n\t" "vmla.f32 q7, q3, d0[1] \n\t" "vmla.f32 q8, q2, d1[0] \n\t" "vmla.f32 q9, q3, d1[0] \n\t" "vmla.f32 q10, q2, d1[1] \n\t" "vmla.f32 q11, q3, d1[1] \n\t" "vmla.f32 q12, q2, d2[0] \n\t" "vmla.f32 q13, q3, d2[0] \n\t" "vmla.f32 q14, q2, d2[1] \n\t" "vmla.f32 q15, q3, d2[1] \n\t" "pld [%[lhs], #128] \n\t" "pld [%[rhs], #128] \n\t" "vld1.32 {d0-d2}, [%[lhs]]! \n\t" "vld1.32 {q2, q3}, [%[rhs]]! \n\t" "vmla.f32 q4, q2, d0[0] \n\t" "vmla.f32 q5, q3, d0[0] \n\t" "vmla.f32 q6, q2, d0[1] \n\t" "vmla.f32 q7, q3, d0[1] \n\t" "vmla.f32 q8, q2, d1[0] \n\t" "vmla.f32 q9, q3, d1[0] \n\t" "vmla.f32 q10, q2, d1[1] \n\t" "vmla.f32 q11, q3, d1[1] \n\t" "vmla.f32 q12, q2, d2[0] \n\t" "vmla.f32 q13, q3, d2[0] \n\t" "vmla.f32 q14, q2, d2[1] \n\t" "vmla.f32 q15, q3, d2[1] \n\t" "vld1.32 {d0-d2}, [%[lhs]]! \n\t" "vld1.32 {q2, q3}, [%[rhs]]! \n\t" "vmla.f32 q4, q2, d0[0] \n\t" "vmla.f32 q5, q3, d0[0] \n\t" "vmla.f32 q6, q2, d0[1] \n\t" "vmla.f32 q7, q3, d0[1] \n\t" "vmla.f32 q8, q2, d1[0] \n\t" "vmla.f32 q9, q3, d1[0] \n\t" "vmla.f32 q10, q2, d1[1] \n\t" "vmla.f32 q11, q3, d1[1] \n\t" "vmla.f32 q12, q2, d2[0] \n\t" "vmla.f32 q13, q3, d2[0] \n\t" "vmla.f32 q14, q2, d2[1] \n\t" "vmla.f32 q15, q3, d2[1] \n\t" "subs %[kc1], %[kc1], #1 \n\t" "bge 1b \n\t" "2: \n\t" "subs %[kc2], %[kc2], #1 \n\t" "blt 4f \n\t" "3: \n\t" "vld1.32 {d0-d2}, [%[lhs]]! \n\t" "vld1.32 {q2, q3}, [%[rhs]]! \n\t" "vmla.f32 q4, q2, d0[0] \n\t" "vmla.f32 q5, q3, d0[0] \n\t" "vmla.f32 q6, q2, d0[1] \n\t" "vmla.f32 q7, q3, d0[1] \n\t" "vmla.f32 q8, q2, d1[0] \n\t" "vmla.f32 q9, q3, d1[0] \n\t" "vmla.f32 q10, q2, d1[1] \n\t" "vmla.f32 q11, q3, d1[1] \n\t" "vmla.f32 q12, q2, d2[0] \n\t" "vmla.f32 q13, q3, d2[0] \n\t" "vmla.f32 q14, q2, d2[1] \n\t" "vmla.f32 q15, q3, d2[1] \n\t" "subs %[kc2], %[kc2], #1 \n\t" "bge 3b \n\t" "4: \n\t" "mov r5, %[c] \n\t" "mov r6, %[step] \n\t" "vst1.32 {q4, q5}, [r5], r6 \n\t" "vst1.32 {q6, q7}, [r5], r6 \n\t" "vst1.32 {q8, q9}, [r5], r6 \n\t" "vst1.32 {q10, q11}, [r5], r6 \n\t" "vst1.32 {q12, q13}, [r5], r6 \n\t" "vst1.32 {q14, q15}, [r5] \n\t" : : [lhs] "r"(lhs), [rhs] "r"(rhs), [c] "r"(output), [kc1] "r"(kc1), [kc2] "r"(kc2), [step] "r"(step) : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } #endif // __aarch64__ } // namespace math } // namespace operators } // namespace paddle_mobile #endif // __ARM_NEON__