diff --git a/CMakeLists.txt b/CMakeLists.txt index ffbb8f68ad3efa2e9d767a5b73374c0727b9cd6f..097986546601ddf2f7f25e14c10ef4dc104c9e3a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,6 @@ option(LOG_PROFILE "log profile" ON) option(CPU "armv7 with neon" ON) option(MALI_GPU "mali gpu" OFF) option(FPGA "fpga" OFF) -set(DEBUGING ON) if (ARM_LINUX) include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake") diff --git a/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/MainActivity.java b/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/MainActivity.java index febe816681d3845a61c5a8b40630e82ac9b4ea95..6a6665dd334d1c7a47fea04ef708b84498f0e357 100755 --- a/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/MainActivity.java +++ b/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/MainActivity.java @@ -121,7 +121,14 @@ public class MainActivity extends Activity { String assetPath = "pml_demo"; String sdcardPath = Environment.getExternalStorageDirectory() + File.separator + assetPath + File.separator + type; - PML.load(sdcardPath); + //PML.load(sdcardPath); + String modelPath = Environment.getExternalStorageDirectory() + + File.separator + assetPath + + File.separator + "googlenet_combine" + File.separator + "model"; + String paramPath = Environment.getExternalStorageDirectory() + + File.separator + assetPath + + File.separator + "googlenet_combine" + File.separator + "params"; + PML.loadCombined(modelPath, paramPath); } }); diff --git a/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/PML.java b/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/PML.java index 7649d4c081223bace01b806d1eb7dca57129ed7c..e67f04e47a77b28bfd8ce98866b1539797c217cd 100644 --- a/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/PML.java +++ b/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/PML.java @@ -8,6 +8,14 @@ public class PML { */ public static native boolean load(String modelPath); + /** + * Load + * @param modelPath + * @param paramPath + * @return + */ + public static native boolean loadCombined(String modelPath,String paramPath); + /** * object detection diff --git a/src/jni/paddle_mobile_jni.cpp b/src/jni/paddle_mobile_jni.cpp index 01d4e52a4b1308a7ff97bc672d1a15d329dbf318..b14f095c1d82f167c1e3f15897b907e730a4a5a8 100644 --- a/src/jni/paddle_mobile_jni.cpp +++ b/src/jni/paddle_mobile_jni.cpp @@ -60,6 +60,15 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env, optimize); } +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined( + JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) { + ANDROIDLOGI("load invoked"); + bool optimize = true; + return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), + jstring2cppstring(env, paramPath), + optimize); +} + JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predict(JNIEnv *env, jclass thiz, jfloatArray buf) { jfloatArray result = NULL; diff --git a/src/jni/paddle_mobile_jni.h b/src/jni/paddle_mobile_jni.h index 86caa9a273ab11124f6ea67efe27dc3529cea69f..ab88816dcb7ec6ba88f12cb270812c4af0923b32 100644 --- a/src/jni/paddle_mobile_jni.h +++ b/src/jni/paddle_mobile_jni.h @@ -22,11 +22,16 @@ extern "C" { namespace paddle_mobile { namespace jni { /** - * load model & params of the net for android + * load separated model for android */ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env, jclass thiz, jstring modelPath); +/** + * 
load combined model for android + */ +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined( + JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath); /** * object detection for anroid diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index e9974df967b293317c3014803bec27d2da73fca3..9582c18cbcfb6e502c42ab4195b553bd3b20093b 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -22,9 +22,14 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { namespace math { -alignas(64) float packedA[MC * KC]; -alignas(64) float packedB[KC * NC]; -alignas(64) float ab[MR * NR]; +int MC = 0; +int KC = 0; +int NC = 0; + +float *packedA; +float *packedB; +float *packedC; +float *zero; // 将A矩阵分块复制到连续内存(ColMajor) void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, float *buffer) { @@ -55,28 +60,39 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, // 将A矩阵分块复制到连续内存(RowMajor) void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, float *buffer) { - int i, j; - const float *Ai, *Ai1, *Ai2, *Ai3; - for (i = 0; i < m - m_tail; i += MR) { - Ai = &A(i, 0); - Ai1 = &A(i + 1, 0); - Ai2 = &A(i + 2, 0); - Ai3 = &A(i + 3, 0); + const float *a0, *a1, *a2, *a3; + for (int i = 0; i < m - m_tail; i += MR) { + a0 = A + i * lda; + a1 = A + (i + 1) * lda; + a2 = A + (i + 2) * lda; + a3 = A + (i + 3) * lda; for (int j = 0; j < k; ++j) { - *buffer++ = *Ai++; - *buffer++ = *Ai1++; - *buffer++ = *Ai2++; - *buffer++ = *Ai3++; + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; } } + int i = m - m_tail; + a0 = &A(i, 0); + a1 = a0 + lda; + a2 = a0 + 2 * lda; + a3 = a0 + 3 * lda; if (m_tail != 0) { - for (j = 0; j < k; ++j) { - for (i = m - m_tail; i < m; ++i) { - *buffer++ = A(i, j); - } - for (i = m; i < m + (MR - m_tail); ++i) { - *buffer++ = 0; - } + if (m_tail <= 3) { + a3 = zero; + } + if (m_tail <= 2) { + a2 = zero; + } + if (m_tail <= 1) { + a1 = zero; + } + for (int j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; } } } @@ -113,35 +129,24 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, // 将B矩阵分块复制到连续内存(RowMajor) void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, float *buffer) { - int i, j; - const float *Bij; - for (j = 0; j < n - n_tail; j += NR) { -#ifdef ARMV7 - - for (i = 0; i < k; ++i) { - Bij = &B(i, j); + const float *b0; + for (int j = 0; j < n - n_tail; j += NR) { + for (int i = 0; i < k; ++i) { + b0 = &B(i, j); asm volatile( - "vld1.32 {q0}, [%[Bij]] \n\t" - "vst1.32 {q0}, [%[buffer]]! \n\t" + "pld [%[b0]] \n\t" + "vld1.32 {q0, q1}, [%[b0]] \n\t" + "vst1.32 {q0, q1}, [%[buffer]]! 
\n\t" : [buffer] "+r"(buffer) - : [Bij] "r"(Bij) - : "memory", "q0"); - } -#else - for (i = 0; i < k; ++i) { - Bij = &B(i, j); - *buffer++ = *Bij; - *buffer++ = *(Bij + 1); - *buffer++ = *(Bij + 2); - *buffer++ = *(Bij + 3); + : [b0] "r"(b0) + : "memory", "q0", "q0"); } -#endif } if (n_tail != 0) { - for (i = 0; i < k; ++i) { - Bij = &B(i, n - n_tail); + for (int i = 0; i < k; ++i) { + b0 = &B(i, n - n_tail); for (int j = n - n_tail; j < n; ++j) { - *buffer++ = *Bij++; + *buffer++ = *b0++; } for (int j = n; j < n + (NR - n_tail); ++j) { *buffer++ = 0; @@ -151,118 +156,53 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, } // 分块矩阵乘法 -void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - int first_time) { - int m_block = (m + MR - 1) / MR * MR; - int n_block = (n + NR - 1) / NR * NR; - - int m_tail = m % MR; - int n_tail = n % NR; +void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, + float beta, float *c, float *C, int ldc, bool relu) { + for (int j = 0; j < nc; j += NR) { + for (int i = 0; i < mc; i += MR) { + // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + } + } - if (first_time) { - PackMatrixB_(k, n, n_tail, B, ldb, packedB); + if (alpha != 1) { + WriteWithAlphaBeta(mc, nc, c, C, ldc); + return; } - PackMatrixA_(m, k, m_tail, A, lda, packedA); - - int i, j, mc, nc; - - // B 取 4 列, 打包预热 - for (j = 0; j < n_block; j += NR) { - nc = (n - j) < NR ? n_tail : NR; - // A 取 4 行,打包预热 - for (i = 0; i < m_block; i += MR) { - mc = (m - i) < MR ? m_tail : MR; - AddDot4x4(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta, - &C(i, j), ldc, mc, nc); - } + if (beta == 0) { + WriteBasic(mc, nc, c, C, ldc); + return; + } + if (beta == 1 && !relu) { + WriteWithAdd(mc, nc, c, C, ldc); + return; + } + if (beta == 1 && relu) { + WriteWithAddRelu(mc, nc, c, C, ldc); + return; } } // 分块矩阵乘法 -void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - int first_time, bool relu = false) { - int m_block = (m + MR - 1) / MR * MR; - int n_block = (n + NR - 1) / NR * NR; - - int m_tail = m % MR; - int n_tail = n % NR; - - if (first_time) { - PackMatrixB_(k, n, n_tail, B, ldb, packedB); - } - PackMatrixA_(m, k, m_tail, A, lda, packedA); - - int i, j, mc, nc; - - // B 取 4 列, 打包预热 - for (j = 0; j < n_block; j += NR) { - nc = (n - j) < NR ? n_tail : NR; - // A 取 4 行,打包预热 - for (i = 0; i < m_block; i += MR) { - mc = (m - i) < MR ? 
m_tail : MR; - AddDot4x4_relu(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta, - &C(i, j), ldc, mc, nc, relu); +void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, int ldc, + bool relu, float *new_scale, float *new_bias) { + for (int j = 0; j < nc; j += NR) { + for (int i = 0; i < mc; i += MR) { + // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); } } -} - -// 计算一个更小的 4 * 4 的 C 矩阵分块 -#if defined(IOS) -void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc) { - // init C - float32x4_t cv0 = vdupq_n_f32(0.0); - float32x4_t cv1 = vdupq_n_f32(0.0); - float32x4_t cv2 = vdupq_n_f32(0.0); - float32x4_t cv3 = vdupq_n_f32(0.0); - - float32x4_t av; - float32x4_t bv; - - float32x2_t av01; - float32x2_t av23; - - for (int p = 0; p < k; p += 1) { - av = vld1q_f32(a); - bv = vld1q_f32(b); - - av01 = vget_low_f32(av); - cv0 = vmlaq_lane_f32(cv0, bv, av01, 0); - cv1 = vmlaq_lane_f32(cv1, bv, av01, 1); - av23 = vget_high_f32(av); - cv2 = vmlaq_lane_f32(cv2, bv, av23, 0); - cv3 = vmlaq_lane_f32(cv3, bv, av23, 1); - a += MR; - b += NR; - } - float32x4x4_t cv = {cv0, cv1, cv2, cv3}; - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; - } - if (j == 0) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0); - } else if (j == 1) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1); - } else if (j == 2) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2); - } else if (j == 3) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3); - } - } + if (relu) { + WriteWithBnRelu(mc, nc, c, C, ldc, new_scale, new_bias); + } else { + WriteWithBn(mc, nc, c, C, ldc, new_scale, new_bias); } } -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu = false) { +#if defined(IOS) +void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) { // init C float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0); @@ -307,183 +247,22 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, } else if (j == 3) { C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3); } - if (C(i, j) < 0) { - C(i, j) = 0; - } } } } +} // namespace math #elif defined(ARMV7) -void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc) { - int kc1 = k / 4, kc2 = k % 4; - int bytes_ldc = 4 * ldc; - int flag_alpha = (alpha == 1.0) ? 1 : 2; - int flag_beta; - if (beta == 0.0) { - flag_beta = 0; - } else if (beta == 1.0) { - flag_beta = 1; - } else { - flag_beta = 2; - } - asm volatile( - "pld [%[a]] \n\t" - "pld [%[b]] \n\t" - "vmov.f32 q10, #0.0 \n\t" - "vmov.f32 q11, #0.0 \n\t" - "vmov.f32 q12, #0.0 \n\t" - "vmov.f32 q13, #0.0 \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "blt end_kc1_%= \n\t" - "loop_kc1_%=: \n\t" - "pld [%[a], #64] \n\t" - "pld [%[b], #64] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q2, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q2, d1[1] \n\t" - "vmla.f32 q10, q3, d2[0] \n\t" - "vmla.f32 q11, q3, d2[1] \n\t" - "vmla.f32 q12, q3, d3[0] \n\t" - "vmla.f32 q13, q3, d3[1] \n\t" - "vld1.32 {q0, q1}, [%[a]]! 
\n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q2, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q2, d1[1] \n\t" - "vmla.f32 q10, q3, d2[0] \n\t" - "vmla.f32 q11, q3, d2[1] \n\t" - "vmla.f32 q12, q3, d3[0] \n\t" - "vmla.f32 q13, q3, d3[1] \n\t" - "subs %[kc1], %[kc1], #1 \n\t" - "bge loop_kc1_%= \n\t" - "end_kc1_%=: \n\t" - - "subs %[kc2], %[kc2], #1 \n\t" - "blt end_kc2_%= \n\t" - "loop_kc2_%=: \n\t" - "vld1.32 {q0}, [%[a]]! \n\t" - "vld1.32 {q1}, [%[b]]! \n\t" - "vmla.f32 q10, q1, d0[0] \n\t" - "vmla.f32 q11, q1, d0[1] \n\t" - "vmla.f32 q12, q1, d1[0] \n\t" - "vmla.f32 q13, q1, d1[1] \n\t" - "subs %[kc2], %[kc2], #1 \n\t" - "bge loop_kc2_%= \n\t" - "end_kc2_%=: \n\t" - - "cmp %[mc], #4 \n\t" - "bne temp_%= \n\t" - "cmp %[nc], #4 \n\t" - "bne temp_%= \n\t" - - "vmov.f32 d8[0], %[alpha] \n\t" - "vmov.f32 d8[1], %[beta] \n\t" - - "cmp %[flag_alpha], #1 \n\t" - "bne alpha_%= \n\t" - - "alpha_%=: \n\t" - "vmul.f32 q10, q10, d8[0] \n\t" - "vmul.f32 q11, q11, d8[0] \n\t" - "vmul.f32 q12, q12, d8[0] \n\t" - "vmul.f32 q13, q13, d8[0] \n\t" - - "beta_%=: \n\t" - "cmp %[flag_beta], #0 \n\t" - "beq memory_%= \n\t" - - "mov r4, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" - "vld1.32 {q0}, [r4], r6 \n\t" - "vld1.32 {q1}, [r4], r6 \n\t" - "vld1.32 {q2}, [r4], r6 \n\t" - "vld1.32 {q3}, [r4] \n\t" - "cmp %[flag_beta], #1 \n\t" - "beq beta_eq1_%= \n\t" - "bne beta_ne1_%= \n\t" - - "beta_eq1_%=: \n\t" - "vadd.f32 q10, q10, q0 \n\t" - "vadd.f32 q11, q11, q1 \n\t" - "vadd.f32 q12, q12, q2 \n\t" - "vadd.f32 q13, q13, q3 \n\t" - "b memory_%= \n\t" - - "beta_ne1_%=: \n\t" - "vmla.f32 q10, q0, d8[1] \n\t" - "vmla.f32 q11, q1, d8[1] \n\t" - "vmla.f32 q12, q2, d8[1] \n\t" - "vmla.f32 q13, q3, d8[1] \n\t" - - "memory_%=: \n\t" - "mov r5, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" - "vst1.32 {q10}, [r5], r6 \n\t" - "vst1.32 {q11}, [r5], r6 \n\t" - "vst1.32 {q12}, [r5], r6 \n\t" - "vst1.32 {q13}, [r5] \n\t" - "b end_%= \n\t" - - "temp_%=: \n\t" - "vst1.32 {q10, q11}, [%[ab]]!\n\t" - "vst1.32 {q12, q13}, [%[ab]] \n\t" - "end_%=: \n\t" - : - : [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1), - [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha), - [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc), - [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta) - : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13"); - - if (mc != MR || nc != NR) { - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - if (alpha != 1.0) { - C(i, j) = alpha * ab[i * MR + j]; - } else { - C(i, j) = ab[i * MR + j]; - } - } else { - if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * ab[i * MR + j]; - } else { - C(i, j) += ab[i * MR + j]; - } - } - } - } - } -} - -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu = false) { - int kc1 = k / 4, kc2 = k % 4; - int bytes_ldc = 4 * ldc; - int flag_alpha = (alpha == 1.0) ? 
1 : 2; - int flag_beta; - if (beta == 0.0) { - flag_beta = 0; - } else if (beta == 1.0) { - flag_beta = 1; - } else { - flag_beta = 2; - } +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k / 4; + int kc2 = k % 4; + int step = 4 * ldc; asm volatile( - "pld [%[a]] \n\t" - "pld [%[b]] \n\t" + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" "vmov.f32 q10, #0.0 \n\t" "vmov.f32 q11, #0.0 \n\t" "vmov.f32 q12, #0.0 \n\t" @@ -492,20 +271,10 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, "subs %[kc1], %[kc1], #1 \n\t" "blt end_kc1_%= \n\t" "loop_kc1_%=: \n\t" - "pld [%[a], #64] \n\t" - "pld [%[b], #64] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q2, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q2, d1[1] \n\t" - "vmla.f32 q10, q3, d2[0] \n\t" - "vmla.f32 q11, q3, d2[1] \n\t" - "vmla.f32 q12, q3, d3[0] \n\t" - "vmla.f32 q13, q3, d3[1] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" + "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" "vmla.f32 q10, q2, d0[0] \n\t" "vmla.f32 q11, q2, d0[1] \n\t" "vmla.f32 q12, q2, d1[0] \n\t" @@ -514,6 +283,16 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, "vmla.f32 q11, q3, d2[1] \n\t" "vmla.f32 q12, q3, d3[0] \n\t" "vmla.f32 q13, q3, d3[1] \n\t" + "vld1.32 {q4, q5}, [%[a_ptr]]! \n\t" + "vld1.32 {q6, q7}, [%[b_ptr]]! \n\t" + "vmla.f32 q10, q6, d8[0] \n\t" + "vmla.f32 q11, q6, d8[1] \n\t" + "vmla.f32 q12, q6, d9[0] \n\t" + "vmla.f32 q13, q6, d9[1] \n\t" + "vmla.f32 q10, q7, d10[0] \n\t" + "vmla.f32 q11, q7, d10[1] \n\t" + "vmla.f32 q12, q7, d11[0] \n\t" + "vmla.f32 q13, q7, d11[1] \n\t" "subs %[kc1], %[kc1], #1 \n\t" "bge loop_kc1_%= \n\t" "end_kc1_%=: \n\t" @@ -521,8 +300,8 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, "subs %[kc2], %[kc2], #1 \n\t" "blt end_kc2_%= \n\t" "loop_kc2_%=: \n\t" - "vld1.32 {q0}, [%[a]]! \n\t" - "vld1.32 {q1}, [%[b]]! \n\t" + "vld1.32 {q0}, [%[a_ptr]]! \n\t" + "vld1.32 {q1}, [%[b_ptr]]! 
\n\t" "vmla.f32 q10, q1, d0[0] \n\t" "vmla.f32 q11, q1, d0[1] \n\t" "vmla.f32 q12, q1, d1[0] \n\t" @@ -531,290 +310,168 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, "bge loop_kc2_%= \n\t" "end_kc2_%=: \n\t" - "cmp %[mc], #4 \n\t" - "bne temp_%= \n\t" - "cmp %[nc], #4 \n\t" - "bne temp_%= \n\t" - - "vmov.f32 d8[0], %[alpha] \n\t" - "vmov.f32 d8[1], %[beta] \n\t" - - "cmp %[flag_alpha], #1 \n\t" - "bne alpha_%= \n\t" - - "alpha_%=: \n\t" - "vmul.f32 q10, q10, d8[0] \n\t" - "vmul.f32 q11, q11, d8[0] \n\t" - "vmul.f32 q12, q12, d8[0] \n\t" - "vmul.f32 q13, q13, d8[0] \n\t" - - "beta_%=: \n\t" - "cmp %[flag_beta], #0 \n\t" - "beq memory_%= \n\t" - - "mov r4, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" - "vld1.32 {q0}, [r4], r6 \n\t" - "vld1.32 {q1}, [r4], r6 \n\t" - "vld1.32 {q2}, [r4], r6 \n\t" - "vld1.32 {q3}, [r4] \n\t" - "cmp %[flag_beta], #1 \n\t" - "beq beta_eq1_%= \n\t" - "bne beta_ne1_%= \n\t" - - "beta_eq1_%=: \n\t" - "vadd.f32 q10, q10, q0 \n\t" - "vadd.f32 q11, q11, q1 \n\t" - "vadd.f32 q12, q12, q2 \n\t" - "vadd.f32 q13, q13, q3 \n\t" - "b memory_%= \n\t" - - "beta_ne1_%=: \n\t" - "vmla.f32 q10, q0, d8[1] \n\t" - "vmla.f32 q11, q1, d8[1] \n\t" - "vmla.f32 q12, q2, d8[1] \n\t" - "vmla.f32 q13, q3, d8[1] \n\t" - - "memory_%=: \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vmax.f32 q11, q11, q14 \n\t" - "vmax.f32 q12, q12, q14 \n\t" - "vmax.f32 q13, q13, q14 \n\t" - "mov r5, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" + "mov r5, %[c] \n\t" + "mov r6, %[step] \n\t" "vst1.32 {q10}, [r5], r6 \n\t" "vst1.32 {q11}, [r5], r6 \n\t" "vst1.32 {q12}, [r5], r6 \n\t" "vst1.32 {q13}, [r5] \n\t" - "b end_%= \n\t" - - "temp_%=: \n\t" - "vst1.32 {q10, q11}, [%[ab]]!\n\t" - "vst1.32 {q12, q13}, [%[ab]] \n\t" - "end_%=: \n\t" : - : [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1), - [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha), - [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc), - [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta) - : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13", - "q14"); - - if (mc != MR || nc != NR) { - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - if (alpha != 1.0) { - C(i, j) = alpha * ab[i * MR + j]; - } else { - C(i, j) = ab[i * MR + j]; - } - } else { - if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * ab[i * MR + j]; - } else { - C(i, j) += ab[i * MR + j]; - } - } - if (relu) { - if (C(i, j) < 0) { - C(i, j) = 0; - } - } - } - } - } + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc2] "r"(kc2), [step] "r"(step) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q10", "q11", "q12", "q13"); } #else -void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc) { - float c[16] = {0}; - float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3; - +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { + float *c0, *c1, *c2, *c3; + c0 = c; + c1 = c + ldc; + c2 = c + 2 * ldc; + c3 = c + 3 * ldc; for (int p = 0; p < k; p += 1) { - reg_b0 = *b++; - reg_b1 = *b++; - reg_b2 = *b++; - reg_b3 = *b++; - - reg_a0 = *a++; - reg_a1 = *a++; - reg_a2 = *a++; - reg_a3 = *a++; - // first row - c[0] += reg_a0 * reg_b0; - c[1] += reg_a0 * reg_b1; - c[2] += reg_a0 * reg_b2; - c[3] += reg_a0 * reg_b3; + c0[0] += a[0] * b[0]; + c0[1] += a[0] * b[1]; + c0[2] += a[0] * b[2]; + c0[3] += a[0] 
* b[3]; // second row - c[4] += reg_a1 * reg_b0; - c[5] += reg_a1 * reg_b1; - c[6] += reg_a1 * reg_b2; - c[7] += reg_a1 * reg_b3; + c1[0] += a[1] * b[0]; + c1[1] += a[1] * b[1]; + c1[2] += a[1] * b[2]; + c1[3] += a[1] * b[3]; // third row - c[8] += reg_a2 * reg_b0; - c[9] += reg_a2 * reg_b1; - c[10] += reg_a2 * reg_b2; - c[11] += reg_a2 * reg_b3; + c2[0] += a[2] * b[0]; + c2[1] += a[2] * b[1]; + c2[2] += a[2] * b[2]; + c2[3] += a[2] * b[3]; // fourth row - c[12] += reg_a3 * reg_b0; - c[13] += reg_a3 * reg_b1; - c[14] += reg_a3 * reg_b2; - c[15] += reg_a3 * reg_b3; - } - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * c[i * MR + j]; - } else { - C(i, j) += c[i * MR + j]; - } - } - } -} - -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu) { - float c[16] = {0}; - float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3; - - for (int p = 0; p < k; p += 1) { - reg_b0 = *b++; - reg_b1 = *b++; - reg_b2 = *b++; - reg_b3 = *b++; - - reg_a0 = *a++; - reg_a1 = *a++; - reg_a2 = *a++; - reg_a3 = *a++; - - // first row - c[0] += reg_a0 * reg_b0; - c[1] += reg_a0 * reg_b1; - c[2] += reg_a0 * reg_b2; - c[3] += reg_a0 * reg_b3; - - // second row - c[4] += reg_a1 * reg_b0; - c[5] += reg_a1 * reg_b1; - c[6] += reg_a1 * reg_b2; - c[7] += reg_a1 * reg_b3; - - // third row - c[8] += reg_a2 * reg_b0; - c[9] += reg_a2 * reg_b1; - c[10] += reg_a2 * reg_b2; - c[11] += reg_a2 * reg_b3; + c3[0] += a[3] * b[0]; + c3[1] += a[3] * b[1]; + c3[2] += a[3] * b[2]; + c3[3] += a[3] * b[3]; - // fourth row - c[12] += reg_a3 * reg_b0; - c[13] += reg_a3 * reg_b1; - c[14] += reg_a3 * reg_b2; - c[15] += reg_a3 * reg_b3; - } - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * c[i * MR + j]; - } else { - C(i, j) += c[i * MR + j]; - } - if (relu) { - if (C(i, j) < 0) { - C(i, j) = 0; - } - } - } + a += 4; + b += 4; } } #endif // 32位 float 矩阵乘法 -void sgemm(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc) { - int i, j, p, mc, nc, kc; - float beta_; - -#ifdef ARMV7 - if (m == 1) { - VectorKernel(1, n, k, alpha, A, lda, B, ldb, beta, C, ldc); - return; +void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, bool relu) { + // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) + // L2 cache is 0.5~4 Mib (Contex-A72 cluster) + int L1 = 30 * 1024; + int L2 = 1 * 1024 * 1024; + + KC = k; + MC = L2 / (2 * KC * sizeof(float)); + NC = MC; + + // make sure MC is multiple of 4, and NC is multiple of 8 + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + 4 - 1) / 4 * 4; + // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; + + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + 8 - 1) / 8 * 8; + // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; + + packedA = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); + packedB = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); + packedC = static_cast( + 
paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + + for (int l = 0; l < KC; ++l) { + zero[l] = 0; } -#endif - for (j = 0; j < n; j += NC) { + int mc, nc; + for (int j = 0; j < n; j += NC) { nc = s_min(n - j, NC); - for (p = 0; p < k; p += KC) { - kc = s_min(k - p, KC); - for (i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - if (p != 0) { - beta_ = 1.0; - } else { - beta_ = beta; - } - InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_, - &C(i, j), ldc, i == 0); - } + PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB); + for (int i = 0; i < m; i += MC) { + mc = s_min(m - i, MC); + PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); + InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc, + relu); } } + + paddle_mobile::memory::Free(packedA); + paddle_mobile::memory::Free(packedB); + paddle_mobile::memory::Free(packedC); + paddle_mobile::memory::Free(zero); } -void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc) { - int i, j, p, mc, nc, kc; - float beta_; - for (j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); - for (p = 0; p < k; p += KC) { - kc = s_min(k - p, KC); - for (i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - if (p != 0) { - beta_ = 1.0; - } else { - beta_ = beta; - } +void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu, float *new_scale, float *new_bias) { + // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) + // L2 cache is 0.5~4 Mib (Contex-A72 cluster) + int L1 = 30 * 1024; + int L2 = 1 * 1024 * 1024; + + KC = k; + MC = L2 / (2 * KC * sizeof(float)); + NC = MC; + + // make sure MC is multiple of 4, and NC is multiple of 8 + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + 4 - 1) / 4 * 4; + // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; + + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + 8 - 1) / 8 * 8; + // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; + + packedA = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); + packedB = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); + packedC = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + + for (int l = 0; l < KC; ++l) { + zero[l] = 0; + } - if (p + KC >= k) { - InnerKernel_relu(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, - beta_, &C(i, j), ldc, i == 0, true); - } else { - InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_, - &C(i, j), ldc, i == 0); - } - } + int mc, nc; + for (int j = 0; j < n; j += NC) { + nc = s_min(n - j, NC); + PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB); + for (int i = 0; i < m; i += MC) { + mc = s_min(m - i, MC); + PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); + InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC, + &C(i, j), ldc, relu, new_scale + ldc * i + j, + new_bias + ldc * i + j); } } + + paddle_mobile::memory::Free(packedA); + paddle_mobile::memory::Free(packedB); + paddle_mobile::memory::Free(packedC); + paddle_mobile::memory::Free(zero); } -#ifdef ARMV7 void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, 
float *C, int ldc) { + const float *B, int ldb, float beta, float *C, int ldc, + bool relu) { float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); const float *a0, *b0, *b1, *b2, *b3; @@ -1016,18 +673,995 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, } } - c0 = bufferC; - C0 = C; - for (int i = 0; i < n; i++) { - if (beta == 1.0) { - *C0++ += *c0++; - } else { - *C0++ = *c0++; - } + if (alpha != 1) { + VecWriteWithAlphaBeta(n, bufferC, C, ldc); + return; + } + if (beta == 0) { + VecWriteBasic(n, bufferC, C, ldc); + return; + } + if (beta == 1 && !relu) { + VecWriteWithAdd(n, bufferC, C, ldc); + return; + } + if (beta == 1 && relu) { + VecWriteWithAddRelu(n, bufferC, C, ldc); + return; } } -#endif -} // namespace math -} // namespace operators +void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, + int lda, const float *B, int ldb, float beta, float *C, + int ldc, bool relu, float *new_scale, float *new_bias) { + float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); + + const float *a0, *b0, *b1, *b2, *b3; + float *c0, *C0; + + int volatile kc1 = k / 4; + int volatile kc2 = k % 4; + int volatile nc1 = n / 16; + int _nc1 = n % 16; + int volatile nc2 = _nc1 / 4; + int volatile nc3 = _nc1 % 4; + for (int i = 0; i < kc1; i++) { + a0 = A + i * 4; + b0 = B + i * 4 * ldb; + b1 = b0 + ldb; + b2 = b1 + ldb; + b3 = b2 + ldb; + c0 = bufferC; + asm volatile( + "pld [%[a0], #16] \n\t" + "vld1.32 {q0}, [%[a0]] \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "cmp %[i], #0 \n\t" + "beq i_eq0_%= \n\t" + "bne i_ne0_%= \n\t" + + "i_eq0_%=: \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "b gemm_nc1_%= \n\t" + + "i_ne0_%=: \n\t" + "pld [%[c0], #64] \n\t" + "vld1.32 {q10, q11}, [%[c0]]! \n\t" + "vld1.32 {q12, q13}, [%[c0]] \n\t" + "sub %[c0], %[c0], #32 \n\t" + + "gemm_nc1_%=: \n\t" + "pld [%[b0], #64] \n\t" + "vld1.32 {q2, q3}, [%[b0]]! \n\t" + "vld1.32 {q4, q5}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q3, d0[0] \n\t" + "vmla.f32 q12, q4, d0[0] \n\t" + "vmla.f32 q13, q5, d0[0] \n\t" + + "pld [%[b1], #64] \n\t" + "vld1.32 {q2, q3}, [%[b1]]! \n\t" + "vld1.32 {q4, q5}, [%[b1]]! \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q4, d0[1] \n\t" + "vmla.f32 q13, q5, d0[1] \n\t" + + "pld [%[b2], #64] \n\t" + "vld1.32 {q2, q3}, [%[b2]]! \n\t" + "vld1.32 {q4, q5}, [%[b2]]! \n\t" + "vmla.f32 q10, q2, d1[0] \n\t" + "vmla.f32 q11, q3, d1[0] \n\t" + "vmla.f32 q12, q4, d1[0] \n\t" + "vmla.f32 q13, q5, d1[0] \n\t" + + "pld [%[b3], #64] \n\t" + "vld1.32 {q2, q3}, [%[b3]]! \n\t" + "vld1.32 {q4, q5}, [%[b3]]! \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q4, d1[1] \n\t" + "vmla.f32 q13, q5, d1[1] \n\t" + + "vst1.32 {q10, q11}, [%[c0]]! \n\t" + "vst1.32 {q12, q13}, [%[c0]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "cmp %[i], #0 \n\t" + "beq ii_eq0_%= \n\t" + "bne ii_ne0_%= \n\t" + + "ii_eq0_%=: \n\t" + "vmov.f32 q10, #0.0 \n\t" + "b gemm_nc2_%= \n\t" + + "ii_ne0_%=: \n\t" + "pld [%[c0], #16] \n\t" + "vld1.32 {q10}, [%[c0]] \n\t" + + "gemm_nc2_%=: \n\t" + "pld [%[b0], #16] \n\t" + "vld1.32 {q2}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + + "pld [%[b1], #16] \n\t" + "vld1.32 {q3}, [%[b1]]! 
\n\t" + "vmla.f32 q10, q3, d0[1] \n\t" + + "pld [%[b2], #16] \n\t" + "vld1.32 {q4}, [%[b2]]! \n\t" + "vmla.f32 q10, q4, d1[0] \n\t" + + "pld [%[b3], #16] \n\t" + "vld1.32 {q5}, [%[b3]]! \n\t" + "vmla.f32 q10, q5, d1[1] \n\t" + + "vst1.32 {q10}, [%[c0]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), + [c0] "+r"(c0) + : [a0] "r"(a0), [i] "r"(i), [nc1] "r"(nc1), [nc2] "r"(nc2) + : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); + + for (int j = 0; j < nc3; j++) { + if (i == 0) { + *c0 = (*a0) * (*b0++); + } else { + *c0 += (*a0) * (*b0++); + } + *c0 += (*(a0 + 1)) * (*b1++); + *c0 += (*(a0 + 2)) * (*b2++); + *c0 += (*(a0 + 3)) * (*b3++); + c0++; + } + } + + for (int i = 0; i < kc2; ++i) { + a0 = A + 4 * kc1 + i; + b0 = B + (4 * kc1 + i) * ldb; + c0 = bufferC; + asm volatile( + "pld [%[a0], #16] \n\t" + "vld1.32 {d0}, [%[a0]] \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "pld [%[c0], #64] \n\t" + "vld1.32 {q10, q11}, [%[c0]]! \n\t" + "vld1.32 {q12, q13}, [%[c0]] \n\t" + "sub %[c0], %[c0], #32 \n\t" + + "gemm_nc1_%=: \n\t" + "pld [%[b0], #64] \n\t" + "vld1.32 {q2, q3}, [%[b0]]! \n\t" + "vld1.32 {q4, q5}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q3, d0[0] \n\t" + "vmla.f32 q12, q4, d0[0] \n\t" + "vmla.f32 q13, q5, d0[0] \n\t" + + "vst1.32 {q10, q11}, [%[c0]]! \n\t" + "vst1.32 {q12, q13}, [%[c0]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "pld [%[c0], #16] \n\t" + "vld1.32 {q10}, [%[c0]] \n\t" + + "gemm_nc2_%=: \n\t" + "vld1.32 {q2}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + + "vst1.32 {q10}, [%[c0]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), + [c0] "+r"(c0) + : [a0] "r"(a0), [nc1] "r"(nc1), [nc2] "r"(nc2) + : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); + + for (int j = 0; j < nc3; j++) { + *c0 += (*a0) * (*b0++); + c0++; + } + } + + if (relu) { + VecWriteWithBnRelu(n, bufferC, C, ldc, new_scale, new_bias); + } else { + VecWriteWithBn(n, bufferC, C, ldc, new_scale, new_bias); + } +} + +void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k / 4; + int kc2 = k % 4; + int step = 4 * ldc; + asm volatile( + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" + + "vmov.f32 q8, #0.0 \n\t" + "vmov.f32 q9, #0.0 \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "vmov.f32 q14, #0.0 \n\t" + "vmov.f32 q15, #0.0 \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" + + "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[b_ptr]]! 
\n\t" + + "vmla.f32 q8, q2, d0[0] \n\t" + "vmla.f32 q9, q3, d0[0] \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q3, d1[0] \n\t" + "vmla.f32 q14, q2, d1[1] \n\t" + "vmla.f32 q15, q3, d1[1] \n\t" + + "vmla.f32 q8, q4, d2[0] \n\t" + "vmla.f32 q9, q5, d2[0] \n\t" + "vmla.f32 q10, q4, d2[1] \n\t" + "vmla.f32 q11, q5, d2[1] \n\t" + "vmla.f32 q12, q4, d3[0] \n\t" + "vmla.f32 q13, q5, d3[0] \n\t" + "vmla.f32 q14, q4, d3[1] \n\t" + "vmla.f32 q15, q5, d3[1] \n\t" + + "pld [%[b_ptr], #64] \n\t" + + "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[b_ptr]]! \n\t" + + "vmla.f32 q8, q2, d0[0] \n\t" + "vmla.f32 q9, q3, d0[0] \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q3, d1[0] \n\t" + "vmla.f32 q14, q2, d1[1] \n\t" + "vmla.f32 q15, q3, d1[1] \n\t" + + "vmla.f32 q8, q4, d2[0] \n\t" + "vmla.f32 q9, q5, d2[0] \n\t" + "vmla.f32 q10, q4, d2[1] \n\t" + "vmla.f32 q11, q5, d2[1] \n\t" + "vmla.f32 q12, q4, d3[0] \n\t" + "vmla.f32 q13, q5, d3[0] \n\t" + "vmla.f32 q14, q4, d3[1] \n\t" + "vmla.f32 q15, q5, d3[1] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "blt end_kc2_%= \n\t" + "loop_kc2_%=: \n\t" + "vld1.32 {q0}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + "vmla.f32 q8, q2, d0[0] \n\t" + "vmla.f32 q9, q3, d0[0] \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q3, d1[0] \n\t" + "vmla.f32 q14, q2, d1[1] \n\t" + "vmla.f32 q15, q3, d1[1] \n\t" + "subs %[kc2], %[kc2], #1 \n\t" + "bge loop_kc2_%= \n\t" + "end_kc2_%=: \n\t" + + "mov r5, %[c] \n\t" + "mov r6, %[step] \n\t" + "vst1.32 {q8, q9}, [r5], r6 \n\t" + "vst1.32 {q10, q11}, [r5], r6 \n\t" + "vst1.32 {q12, q13}, [r5], r6 \n\t" + "vst1.32 {q14, q15}, [r5] \n\t" + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc2] "r"(kc2), [step] "r"(step) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// C = A * B +void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int step = 4 * ldc; + int step1 = 4 * (NC - 16 * nc1); + int volatile m = mc; + + float *volatile c_ptr, *volatile C_ptr; + float *C0, *c0; + c_ptr = c; + C_ptr = C; + if (nc1 > 0) { + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" + "vst1.32 {q0, q1}, [r6]! \n\t" + + "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" + "vst1.32 {q2, q3}, [r6]! 
\n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3"); + } + + if (_nc1 != 0) { + for (int i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 16 + i * ldc; + c0 = c_ptr + nc1 * 16 + i * NC; + for (int j = 0; j < _nc1; j++) { + *C0++ = *c0++; + } + } + } +} + +// C = alpha * A * B + beta * C +void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} + +// C = A * B + C +void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int step = 4 * ldc; + int step1 = 4 * (NC - 16 * nc1); + int volatile m = mc; + + float *volatile c_ptr, *volatile C_ptr; + float *C0, *c0; + c_ptr = c; + C_ptr = C; + if (nc1 > 0) { + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [r6] \n\t" + "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [r6]! \n\t" + + "vld1.32 {q4, q5}, [r6] \n\t" + "vld1.32 {q6, q7}, [%[c_ptr]]! \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [r6]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q10", "q11", "q12", "q13"); + } + + if (_nc1 != 0) { + for (int i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 16 + i * ldc; + c0 = c_ptr + nc1 * 16 + i * NC; + for (int j = 0; j < _nc1; j++) { + *C0++ += *c0++; + } + } + } +} + +// C = A * B + C, relu(C) +void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int step = 4 * ldc; + int step1 = 4 * (NC - 16 * nc1); + int volatile m = mc; + + float *volatile c_ptr, *volatile C_ptr; + float *C0, *c0; + c_ptr = c; + C_ptr = C; + if (nc1 > 0) { + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [r6] \n\t" + "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [r6]! \n\t" + + "vld1.32 {q4, q5}, [r6] \n\t" + "vld1.32 {q6, q7}, [%[c_ptr]]! \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [r6]! 
\n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q10", "q11", "q12", "q13"); + } + + if (_nc1 != 0) { + for (int i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 16 + i * ldc; + c0 = c_ptr + nc1 * 16 + i * NC; + for (int j = 0; j < _nc1; j++) { + *C0 += *c0; + if (*C0 < 0) { + *C0 = 0; + } + C0++; + c0++; + } + } + } +} + +// C = A * B, batchnorm(C) +void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + int step = 4 * (ldc - nc); + int step1 = 4 * (NC - nc); + + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r5, %[nc1] \n\t" + "mov r6, %[nc2] \n\t" + + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! \n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs r6, r6, #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs r6, r6, #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + "add %[c], %[c], %[step1] \n\t" + "add %[scale], %[scale], %[step] \n\t" + "add %[bias], %[bias], %[step] \n\t" + "add %[C], %[C], %[step] \n\t" + + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), + [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), + [scale] "r"(scale), [bias] "r"(bias) + : "memory", "cc", "r5", "r6", "r7", "r8", "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q10", "q11", "q12", "q13"); +} + +// C = A * B, batchnorm(C), relu(C) +void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + int step = 4 * (ldc - nc); + int step1 = 4 * (NC - nc); + + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r5, %[nc1] \n\t" + "mov r6, %[nc2] \n\t" + + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! 
\n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs r6, r6, #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs r6, r6, #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + "add %[c], %[c], %[step1] \n\t" + "add %[scale], %[scale], %[step] \n\t" + "add %[bias], %[bias], %[step] \n\t" + "add %[C], %[C], %[step] \n\t" + + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), + [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), + [scale] "r"(scale), [bias] "r"(bias) + : "memory", "r5", "r6", "r7", "r8", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q10", "q11", "q12", "q13", "q14"); +} + +// C = A * B +void VecWriteBasic(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vst1.32 {q0, q1}, [%[C]]! \n\t" + + "vld1.32 {q2, q3}, [%[c]]! \n\t" + "vst1.32 {q2, q3}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q4}, [%[c]]! \n\t" + "vst1.32 {q4}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + "sub %[c], %[c], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + "vld1.32 {q5}, [%[c]]! \n\t" + "vst1.32 {q5}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5"); +} + +// C = alpha * A * B + beta * C +void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} + +// C = A * B + C +void VecWriteWithAdd(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[C]] \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[C]] \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! 
\n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + : [C] "+r"(C), [c] "+r"(c) + : [nc1] "r"(nc1) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", + "q12", "q13"); + + if (_nc1 != 0) { + for (int j = 0; j < _nc1; j++) { + *C++ += *c++; + } + } +} + +// C = A * B + C, relu(C) +void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[C]] \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[C]] \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + : [C] "+r"(C), [c] "+r"(c) + : [nc1] "r"(nc1) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", + "q12", "q13"); + + if (_nc1 != 0) { + for (int j = 0; j < _nc1; j++) { + *C += *c; + if (*C < 0) { + *C = 0; + } + C++; + c++; + } + } +} + +// C = A * B, batchnorm(C) +void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! \n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3), + [scale] "r"(scale), [bias] "r"(bias) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", + "q12", "q13"); +} + +// C = A * B, batchnorm(C), relu(C) +void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! 
\n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3), + [scale] "r"(scale), [bias] "r"(bias) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", + "q12", "q13", "q14"); +} + +} // namespace operators +} // namespace paddle_mobile } // namespace paddle_mobile diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index 6d7ae6d2bcdbd7e24cb3c2389dd3cdf09a807892..b4bce43c7a29fba09ade7512cbc660f0ac2888ab 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -19,12 +19,8 @@ limitations under the License. */ #define B(i, j) B[(i)*ldb + (j)] #define C(i, j) C[(i)*ldc + (j)] -// 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k -#define MC 128 -#define KC 128 -#define NC 1024 #define MR 4 -#define NR 4 +#define NR 8 #define s_min(i, j) ((i) < (j) ? 
(i) : (j)) @@ -49,28 +45,66 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, float *buffer); // 分块矩阵乘法 -void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - int first_time); +void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, + float beta, float *c, float *C, int ldc, bool relu); + +void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, int ldc, + bool relu, float *new_scale, float *new_bias); // 向量矩阵乘法 (M = 1) void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc); - -// 计算一个更小的 4 * 4 的 C 矩阵分块 -void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B, - int ldb, float beta, float *C, int ldc, int mc, int nc); - -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu); + const float *B, int ldb, float beta, float *C, int ldc, + bool relu); + +void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, + int lda, const float *B, int ldb, float beta, float *C, + int ldc, bool relu, float *new_scale, float *new_bias); + +// 计算一个更小的 C 矩阵分块 +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); +void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); + +// 分块矩阵乘法结果回写 +// C = A * B +void WriteBasic(int mc, int nc, float *c, float *C, int ldc); +// C = alpha * A * B + beta * C +void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc); +// C = A * B + C +void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc); +// C = A * B + C, relu(C) +void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc); +// C = A * B, batchnorm(C) +void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, + float *new_bias); +// C = A * B, batchnorm(C), relu(C) +void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias); + +// 向量矩阵乘法结果回写 +// C = A * B +void VecWriteBasic(int n, float *c, float *C, int ldc); +// C = alpha * A * B + beta * C +void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc); +// C = A * B + C +void VecWriteWithAdd(int n, float *c, float *C, int ldc); +// C = A * B + C, relu(C) +void VecWriteWithAddRelu(int n, float *c, float *C, int ldc); +// C = A * B, batchnorm(C) +void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale, + float *new_bias); +// C = A * B, batchnorm(C), relu(C) +void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale, + float *new_bias); // 32位 float 矩阵乘法 -void sgemm(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc); +void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, bool relu); -void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc); +// 32位 float 矩阵乘法, 并对结果进行 batchnrom +void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu, float *new_scale, float *new_bias); // 64位 double 矩阵乘法 void dgemm(int m, int n, int k, float alpha, const double *A, int lda, diff --git a/src/operators/math/math_function.cpp 
b/src/operators/math/math_function.cpp index fd4106038c7446e659736c6b3c61b5aa05127e72..ca5367788ed87da070dd19900e8d546e51caf337 100644 --- a/src/operators/math/math_function.cpp +++ b/src/operators/math/math_function.cpp @@ -39,22 +39,18 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, int M = dim_out[0]; int N = dim_out[1]; - int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + int K = (!trans_a) ? dim_a[1] : dim_a[0]; - if (relu) { - sgemm_relu(M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), N, beta, matrix_out->data(), N); - } else { - sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, - beta, matrix_out->data(), N); - } + Sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, + beta, matrix_out->data(), N, relu); } template <> -void matmul(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, - double alpha, framework::Tensor *matrix_out, double beta, - bool relu) { +void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, + const framework::Tensor &matrix_b, bool trans_b, + float alpha, framework::Tensor *matrix_out, float beta, + bool relu, framework::Tensor *new_scale, + framework::Tensor *new_bias) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); @@ -71,7 +67,11 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, int M = dim_out[0]; int N = dim_out[1]; - int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + int K = (!trans_a) ? dim_a[1] : dim_a[0]; + + SgemmWithBn(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), + N, beta, matrix_out->data(), N, relu, + new_scale->data(), new_bias->data()); } } // namespace math diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h index 0b953ec6a3b2a03a94a91884b9daf3ed88523a22..0ca7815fc2bcff2be0345b581d3dfb26cf55794c 100644 --- a/src/operators/math/math_function.h +++ b/src/operators/math/math_function.h @@ -26,6 +26,12 @@ template void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, T alpha, framework::Tensor *matrix_out, T beta, bool relu = false); + +template +void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, + const framework::Tensor &matrix_b, bool trans_b, T alpha, + framework::Tensor *matrix_out, T beta, bool relu, + framework::Tensor *new_scale, framework::Tensor *new_bias); } // namespace math } // namespace operators } // namespace paddle_mobile
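
Note (not part of the patch): the removed sgemm/sgemm_relu entry points are folded into a single Sgemm(m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, relu), which packs A and B per block and then selects a write-back routine (WriteBasic, WriteWithAdd, WriteWithAddRelu, WriteWithAlphaBeta) from alpha, beta and relu. The following is a minimal scalar sketch of the intended row-major semantics, useful only as a reference when checking the NEON kernels; the name sgemm_reference and the code are illustrative assumptions, not code from this repository.

#include <algorithm>

// Scalar reference for the semantics of the new Sgemm entry point
// (row-major A: m x k with leading dimension lda, B: k x n with ldb,
// C: m x n with ldc). Hedged sketch only; sgemm_reference is not a
// function defined by this patch.
void sgemm_reference(int m, int n, int k, float alpha, const float *A, int lda,
                     const float *B, int ldb, float beta, float *C, int ldc,
                     bool relu) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.0f;
      for (int p = 0; p < k; ++p) {
        acc += A[i * lda + p] * B[p * ldb + j];
      }
      float out = alpha * acc + beta * C[i * ldc + j];
      // The patched InnerKernel only applies relu on its beta == 1 path
      // (WriteWithAddRelu); the reference applies it whenever requested.
      C[i * ldc + j] = relu ? std::max(out, 0.0f) : out;
    }
  }
}

Worth noting when comparing against this reference: WriteWithAlphaBeta and VecWriteWithAlphaBeta are left as empty stubs in this patch, so the alpha != 1 (and general beta) path of the new Sgemm does not yet write any output.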
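
Note (not part of the patch): the fixed MC/KC/NC macros in gemm.h are replaced by run-time blocking inside Sgemm/SgemmWithBn: KC = k, MC = NC = L2 / (2 * KC * sizeof(float)), then m and n are split into roughly equal blocks and MC is rounded up to a multiple of MR (4) while NC is rounded up to a multiple of NR (8). A small self-contained sketch of that arithmetic follows; compute_blocks and its signature are illustrative, not part of the patch.

#include <cstdio>

// Hedged sketch of the run-time block-size selection used by the new
// Sgemm/SgemmWithBn (MR = 4, NR = 8 after this patch).
void compute_blocks(int m, int n, int k, int *MC, int *KC, int *NC) {
  const int L2 = 1 * 1024 * 1024;  // L2 budget assumed in the patch
  *KC = k;
  int mc = L2 / (2 * (*KC) * static_cast<int>(sizeof(float)));
  int nc = mc;
  // Split m into mblock_num roughly equal blocks, then round MC up to 4.
  int mblock_num = (m + mc - 1) / mc;
  mc = (m + mblock_num - 1) / mblock_num;
  *MC = (mc + 4 - 1) / 4 * 4;
  // Same for n, rounding NC up to a multiple of 8 (the new NR).
  int nblock_num = (n + nc - 1) / nc;
  nc = (n + nblock_num - 1) / nblock_num;
  *NC = (nc + 8 - 1) / 8 * 8;
}

int main() {
  int MC, KC, NC;
  compute_blocks(1000, 1000, 256, &MC, &KC, &NC);
  printf("MC=%d KC=%d NC=%d\n", MC, KC, NC);  // prints MC=500 KC=256 NC=504
  return 0;
}

For example, with m = n = 1000, k = 256 and the 1 MiB L2 budget used above, the initial block size is 512, m and n each split into two blocks of 500, and rounding gives MC = 500, KC = 256, NC = 504; the packing buffers packedA, packedB and packedC are then allocated from these sizes.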