提交 2b5c8381 编写于 作者: Z zhaojiaying01

Optimize Gemm: fuse add relu batchnorm op, dynamic block, add AddDot4x8,...

Optimize Gemm: fuse add relu batchnorm op, dynamic block, add AddDot4x8, optimize memory write back.
上级 f6d34bd0
...@@ -22,9 +22,14 @@ limitations under the License. */ ...@@ -22,9 +22,14 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
alignas(64) float packedA[MC * KC]; int MC = 0;
alignas(64) float packedB[KC * NC]; int KC = 0;
alignas(64) float ab[MR * NR]; int NC = 0;
float *packedA;
float *packedB;
float *packedC;
float *zero;
// 将A矩阵分块复制到连续内存(ColMajor) // 将A矩阵分块复制到连续内存(ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
...@@ -55,28 +60,39 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, ...@@ -55,28 +60,39 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
// 将A矩阵分块复制到连续内存(RowMajor) // 将A矩阵分块复制到连续内存(RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
int i, j; const float *a0, *a1, *a2, *a3;
const float *Ai, *Ai1, *Ai2, *Ai3; for (int i = 0; i < m - m_tail; i += MR) {
for (i = 0; i < m - m_tail; i += MR) { a0 = A + i * lda;
Ai = &A(i, 0); a1 = A + (i + 1) * lda;
Ai1 = &A(i + 1, 0); a2 = A + (i + 2) * lda;
Ai2 = &A(i + 2, 0); a3 = A + (i + 3) * lda;
Ai3 = &A(i + 3, 0);
for (int j = 0; j < k; ++j) { for (int j = 0; j < k; ++j) {
*buffer++ = *Ai++; *buffer++ = *a0++;
*buffer++ = *Ai1++; *buffer++ = *a1++;
*buffer++ = *Ai2++; *buffer++ = *a2++;
*buffer++ = *Ai3++; *buffer++ = *a3++;
} }
} }
int i = m - m_tail;
a0 = &A(i, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
if (m_tail != 0) { if (m_tail != 0) {
for (j = 0; j < k; ++j) { if (m_tail <= 3) {
for (i = m - m_tail; i < m; ++i) { a3 = zero;
*buffer++ = A(i, j);
} }
for (i = m; i < m + (MR - m_tail); ++i) { if (m_tail <= 2) {
*buffer++ = 0; a2 = zero;
} }
if (m_tail <= 1) {
a1 = zero;
}
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
*buffer++ = *a1++;
*buffer++ = *a2++;
*buffer++ = *a3++;
} }
} }
} }
...@@ -113,35 +129,24 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -113,35 +129,24 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 将B矩阵分块复制到连续内存(RowMajor) // 将B矩阵分块复制到连续内存(RowMajor)
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) { float *buffer) {
int i, j; const float *b0;
const float *Bij; for (int j = 0; j < n - n_tail; j += NR) {
for (j = 0; j < n - n_tail; j += NR) { for (int i = 0; i < k; ++i) {
#ifdef ARMV7 b0 = &B(i, j);
for (i = 0; i < k; ++i) {
Bij = &B(i, j);
asm volatile( asm volatile(
"vld1.32 {q0}, [%[Bij]] \n\t" "pld [%[b0]] \n\t"
"vst1.32 {q0}, [%[buffer]]! \n\t" "vld1.32 {q0, q1}, [%[b0]] \n\t"
"vst1.32 {q0, q1}, [%[buffer]]! \n\t"
: [buffer] "+r"(buffer) : [buffer] "+r"(buffer)
: [Bij] "r"(Bij) : [b0] "r"(b0)
: "memory", "q0"); : "memory", "q0", "q0");
}
#else
for (i = 0; i < k; ++i) {
Bij = &B(i, j);
*buffer++ = *Bij;
*buffer++ = *(Bij + 1);
*buffer++ = *(Bij + 2);
*buffer++ = *(Bij + 3);
} }
#endif
} }
if (n_tail != 0) { if (n_tail != 0) {
for (i = 0; i < k; ++i) { for (int i = 0; i < k; ++i) {
Bij = &B(i, n - n_tail); b0 = &B(i, n - n_tail);
for (int j = n - n_tail; j < n; ++j) { for (int j = n - n_tail; j < n; ++j) {
*buffer++ = *Bij++; *buffer++ = *b0++;
} }
for (int j = n; j < n + (NR - n_tail); ++j) { for (int j = n; j < n + (NR - n_tail); ++j) {
*buffer++ = 0; *buffer++ = 0;
...@@ -151,118 +156,53 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, ...@@ -151,118 +156,53 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
} }
// 分块矩阵乘法 // 分块矩阵乘法
void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda, void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
const float *B, int ldb, float beta, float *C, int ldc, float beta, float *c, float *C, int ldc, bool relu) {
int first_time) { for (int j = 0; j < nc; j += NR) {
int m_block = (m + MR - 1) / MR * MR; for (int i = 0; i < mc; i += MR) {
int n_block = (n + NR - 1) / NR * NR; // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
int m_tail = m % MR;
int n_tail = n % NR;
if (first_time) {
PackMatrixB_(k, n, n_tail, B, ldb, packedB);
}
PackMatrixA_(m, k, m_tail, A, lda, packedA);
int i, j, mc, nc;
// B 取 4 列, 打包预热
for (j = 0; j < n_block; j += NR) {
nc = (n - j) < NR ? n_tail : NR;
// A 取 4 行,打包预热
for (i = 0; i < m_block; i += MR) {
mc = (m - i) < MR ? m_tail : MR;
AddDot4x4(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta,
&C(i, j), ldc, mc, nc);
} }
} }
}
// 分块矩阵乘法
void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
int first_time, bool relu = false) {
int m_block = (m + MR - 1) / MR * MR;
int n_block = (n + NR - 1) / NR * NR;
int m_tail = m % MR;
int n_tail = n % NR;
if (first_time) { if (alpha != 1) {
PackMatrixB_(k, n, n_tail, B, ldb, packedB); WriteWithAlphaBeta(mc, nc, c, C, ldc);
return;
} }
PackMatrixA_(m, k, m_tail, A, lda, packedA); if (beta == 0) {
WriteBasic(mc, nc, c, C, ldc);
int i, j, mc, nc; return;
}
// B 取 4 列, 打包预热 if (beta == 1 && !relu) {
for (j = 0; j < n_block; j += NR) { WriteWithAdd(mc, nc, c, C, ldc);
nc = (n - j) < NR ? n_tail : NR; return;
// A 取 4 行,打包预热
for (i = 0; i < m_block; i += MR) {
mc = (m - i) < MR ? m_tail : MR;
AddDot4x4_relu(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta,
&C(i, j), ldc, mc, nc, relu);
} }
if (beta == 1 && relu) {
WriteWithAddRelu(mc, nc, c, C, ldc);
return;
} }
} }
// 计算一个更小的 4 * 4 的 C 矩阵分块 // 分块矩阵乘法
#if defined(IOS) void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, const float *b, float beta, float *c, float *C, int ldc,
int ldb, float beta, float *C, int ldc, int mc, int nc) { bool relu, float *new_scale, float *new_bias) {
// init C for (int j = 0; j < nc; j += NR) {
float32x4_t cv0 = vdupq_n_f32(0.0); for (int i = 0; i < mc; i += MR) {
float32x4_t cv1 = vdupq_n_f32(0.0); // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
float32x4_t cv2 = vdupq_n_f32(0.0); AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
float32x4_t cv3 = vdupq_n_f32(0.0);
float32x4_t av;
float32x4_t bv;
float32x2_t av01;
float32x2_t av23;
for (int p = 0; p < k; p += 1) {
av = vld1q_f32(a);
bv = vld1q_f32(b);
av01 = vget_low_f32(av);
cv0 = vmlaq_lane_f32(cv0, bv, av01, 0);
cv1 = vmlaq_lane_f32(cv1, bv, av01, 1);
av23 = vget_high_f32(av);
cv2 = vmlaq_lane_f32(cv2, bv, av23, 0);
cv3 = vmlaq_lane_f32(cv3, bv, av23, 1);
a += MR;
b += NR;
}
float32x4x4_t cv = {cv0, cv1, cv2, cv3};
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (j == 0) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0);
} else if (j == 1) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1);
} else if (j == 2) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2);
} else if (j == 3) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3);
} }
} }
if (relu) {
WriteWithBnRelu(mc, nc, c, C, ldc, new_scale, new_bias);
} else {
WriteWithBn(mc, nc, c, C, ldc, new_scale, new_bias);
} }
} }
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, #if defined(IOS)
int ldb, float beta, float *C, int ldc, int mc, int nc, void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
bool relu = false) {
// init C // init C
float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0);
...@@ -307,183 +247,22 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, ...@@ -307,183 +247,22 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
} else if (j == 3) { } else if (j == 3) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3); C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3);
} }
if (C(i, j) < 0) {
C(i, j) = 0;
}
} }
} }
} }
} // namespace math
#elif defined(ARMV7) #elif defined(ARMV7)
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
int ldb, float beta, float *C, int ldc, int mc, int nc) { const float *a_ptr, *b_ptr;
int kc1 = k / 4, kc2 = k % 4; a_ptr = a;
int bytes_ldc = 4 * ldc; b_ptr = b;
int flag_alpha = (alpha == 1.0) ? 1 : 2; int kc1 = k / 4;
int flag_beta; int kc2 = k % 4;
if (beta == 0.0) { int step = 4 * ldc;
flag_beta = 0;
} else if (beta == 1.0) {
flag_beta = 1;
} else {
flag_beta = 2;
}
asm volatile(
"pld [%[a]] \n\t"
"pld [%[b]] \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
"pld [%[a], #64] \n\t"
"pld [%[b], #64] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t"
"loop_kc2_%=: \n\t"
"vld1.32 {q0}, [%[a]]! \n\t"
"vld1.32 {q1}, [%[b]]! \n\t"
"vmla.f32 q10, q1, d0[0] \n\t"
"vmla.f32 q11, q1, d0[1] \n\t"
"vmla.f32 q12, q1, d1[0] \n\t"
"vmla.f32 q13, q1, d1[1] \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"bge loop_kc2_%= \n\t"
"end_kc2_%=: \n\t"
"cmp %[mc], #4 \n\t"
"bne temp_%= \n\t"
"cmp %[nc], #4 \n\t"
"bne temp_%= \n\t"
"vmov.f32 d8[0], %[alpha] \n\t"
"vmov.f32 d8[1], %[beta] \n\t"
"cmp %[flag_alpha], #1 \n\t"
"bne alpha_%= \n\t"
"alpha_%=: \n\t"
"vmul.f32 q10, q10, d8[0] \n\t"
"vmul.f32 q11, q11, d8[0] \n\t"
"vmul.f32 q12, q12, d8[0] \n\t"
"vmul.f32 q13, q13, d8[0] \n\t"
"beta_%=: \n\t"
"cmp %[flag_beta], #0 \n\t"
"beq memory_%= \n\t"
"mov r4, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vld1.32 {q0}, [r4], r6 \n\t"
"vld1.32 {q1}, [r4], r6 \n\t"
"vld1.32 {q2}, [r4], r6 \n\t"
"vld1.32 {q3}, [r4] \n\t"
"cmp %[flag_beta], #1 \n\t"
"beq beta_eq1_%= \n\t"
"bne beta_ne1_%= \n\t"
"beta_eq1_%=: \n\t"
"vadd.f32 q10, q10, q0 \n\t"
"vadd.f32 q11, q11, q1 \n\t"
"vadd.f32 q12, q12, q2 \n\t"
"vadd.f32 q13, q13, q3 \n\t"
"b memory_%= \n\t"
"beta_ne1_%=: \n\t"
"vmla.f32 q10, q0, d8[1] \n\t"
"vmla.f32 q11, q1, d8[1] \n\t"
"vmla.f32 q12, q2, d8[1] \n\t"
"vmla.f32 q13, q3, d8[1] \n\t"
"memory_%=: \n\t"
"mov r5, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vst1.32 {q10}, [r5], r6 \n\t"
"vst1.32 {q11}, [r5], r6 \n\t"
"vst1.32 {q12}, [r5], r6 \n\t"
"vst1.32 {q13}, [r5] \n\t"
"b end_%= \n\t"
"temp_%=: \n\t"
"vst1.32 {q10, q11}, [%[ab]]!\n\t"
"vst1.32 {q12, q13}, [%[ab]] \n\t"
"end_%=: \n\t"
:
: [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1),
[kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha),
[beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc),
[flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta)
: "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13");
if (mc != MR || nc != NR) {
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
if (alpha != 1.0) {
C(i, j) = alpha * ab[i * MR + j];
} else {
C(i, j) = ab[i * MR + j];
}
} else {
if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * ab[i * MR + j];
} else {
C(i, j) += ab[i * MR + j];
}
}
}
}
}
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu = false) {
int kc1 = k / 4, kc2 = k % 4;
int bytes_ldc = 4 * ldc;
int flag_alpha = (alpha == 1.0) ? 1 : 2;
int flag_beta;
if (beta == 0.0) {
flag_beta = 0;
} else if (beta == 1.0) {
flag_beta = 1;
} else {
flag_beta = 2;
}
asm volatile( asm volatile(
"pld [%[a]] \n\t" "pld [%[a_ptr]] \n\t"
"pld [%[b]] \n\t" "pld [%[b_ptr]] \n\t"
"vmov.f32 q10, #0.0 \n\t" "vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t" "vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t" "vmov.f32 q12, #0.0 \n\t"
...@@ -492,20 +271,10 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, ...@@ -492,20 +271,10 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"subs %[kc1], %[kc1], #1 \n\t" "subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t" "blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t" "loop_kc1_%=: \n\t"
"pld [%[a], #64] \n\t" "pld [%[a_ptr], #64] \n\t"
"pld [%[b], #64] \n\t" "pld [%[b_ptr], #64] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t" "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t" "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t" "vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t" "vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t" "vmla.f32 q12, q2, d1[0] \n\t"
...@@ -514,6 +283,16 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, ...@@ -514,6 +283,16 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"vmla.f32 q11, q3, d2[1] \n\t" "vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t" "vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t" "vmla.f32 q13, q3, d3[1] \n\t"
"vld1.32 {q4, q5}, [%[a_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[b_ptr]]! \n\t"
"vmla.f32 q10, q6, d8[0] \n\t"
"vmla.f32 q11, q6, d8[1] \n\t"
"vmla.f32 q12, q6, d9[0] \n\t"
"vmla.f32 q13, q6, d9[1] \n\t"
"vmla.f32 q10, q7, d10[0] \n\t"
"vmla.f32 q11, q7, d10[1] \n\t"
"vmla.f32 q12, q7, d11[0] \n\t"
"vmla.f32 q13, q7, d11[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t" "subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t" "bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t" "end_kc1_%=: \n\t"
...@@ -521,8 +300,8 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, ...@@ -521,8 +300,8 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"subs %[kc2], %[kc2], #1 \n\t" "subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t" "blt end_kc2_%= \n\t"
"loop_kc2_%=: \n\t" "loop_kc2_%=: \n\t"
"vld1.32 {q0}, [%[a]]! \n\t" "vld1.32 {q0}, [%[a_ptr]]! \n\t"
"vld1.32 {q1}, [%[b]]! \n\t" "vld1.32 {q1}, [%[b_ptr]]! \n\t"
"vmla.f32 q10, q1, d0[0] \n\t" "vmla.f32 q10, q1, d0[0] \n\t"
"vmla.f32 q11, q1, d0[1] \n\t" "vmla.f32 q11, q1, d0[1] \n\t"
"vmla.f32 q12, q1, d1[0] \n\t" "vmla.f32 q12, q1, d1[0] \n\t"
...@@ -531,290 +310,168 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, ...@@ -531,290 +310,168 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"bge loop_kc2_%= \n\t" "bge loop_kc2_%= \n\t"
"end_kc2_%=: \n\t" "end_kc2_%=: \n\t"
"cmp %[mc], #4 \n\t" "mov r5, %[c] \n\t"
"bne temp_%= \n\t" "mov r6, %[step] \n\t"
"cmp %[nc], #4 \n\t"
"bne temp_%= \n\t"
"vmov.f32 d8[0], %[alpha] \n\t"
"vmov.f32 d8[1], %[beta] \n\t"
"cmp %[flag_alpha], #1 \n\t"
"bne alpha_%= \n\t"
"alpha_%=: \n\t"
"vmul.f32 q10, q10, d8[0] \n\t"
"vmul.f32 q11, q11, d8[0] \n\t"
"vmul.f32 q12, q12, d8[0] \n\t"
"vmul.f32 q13, q13, d8[0] \n\t"
"beta_%=: \n\t"
"cmp %[flag_beta], #0 \n\t"
"beq memory_%= \n\t"
"mov r4, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vld1.32 {q0}, [r4], r6 \n\t"
"vld1.32 {q1}, [r4], r6 \n\t"
"vld1.32 {q2}, [r4], r6 \n\t"
"vld1.32 {q3}, [r4] \n\t"
"cmp %[flag_beta], #1 \n\t"
"beq beta_eq1_%= \n\t"
"bne beta_ne1_%= \n\t"
"beta_eq1_%=: \n\t"
"vadd.f32 q10, q10, q0 \n\t"
"vadd.f32 q11, q11, q1 \n\t"
"vadd.f32 q12, q12, q2 \n\t"
"vadd.f32 q13, q13, q3 \n\t"
"b memory_%= \n\t"
"beta_ne1_%=: \n\t"
"vmla.f32 q10, q0, d8[1] \n\t"
"vmla.f32 q11, q1, d8[1] \n\t"
"vmla.f32 q12, q2, d8[1] \n\t"
"vmla.f32 q13, q3, d8[1] \n\t"
"memory_%=: \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vmax.f32 q11, q11, q14 \n\t"
"vmax.f32 q12, q12, q14 \n\t"
"vmax.f32 q13, q13, q14 \n\t"
"mov r5, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vst1.32 {q10}, [r5], r6 \n\t" "vst1.32 {q10}, [r5], r6 \n\t"
"vst1.32 {q11}, [r5], r6 \n\t" "vst1.32 {q11}, [r5], r6 \n\t"
"vst1.32 {q12}, [r5], r6 \n\t" "vst1.32 {q12}, [r5], r6 \n\t"
"vst1.32 {q13}, [r5] \n\t" "vst1.32 {q13}, [r5] \n\t"
"b end_%= \n\t"
"temp_%=: \n\t"
"vst1.32 {q10, q11}, [%[ab]]!\n\t"
"vst1.32 {q12, q13}, [%[ab]] \n\t"
"end_%=: \n\t"
: :
: [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1), : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha), [kc2] "r"(kc2), [step] "r"(step)
[beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc), : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
[flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta) "q10", "q11", "q12", "q13");
: "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13",
"q14");
if (mc != MR || nc != NR) {
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
if (alpha != 1.0) {
C(i, j) = alpha * ab[i * MR + j];
} else {
C(i, j) = ab[i * MR + j];
}
} else {
if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * ab[i * MR + j];
} else {
C(i, j) += ab[i * MR + j];
}
}
if (relu) {
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
}
}
}
} }
#else #else
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
int ldb, float beta, float *C, int ldc, int mc, int nc) { float *c0, *c1, *c2, *c3;
float c[16] = {0}; c0 = c;
float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3; c1 = c + ldc;
c2 = c + 2 * ldc;
c3 = c + 3 * ldc;
for (int p = 0; p < k; p += 1) { for (int p = 0; p < k; p += 1) {
reg_b0 = *b++;
reg_b1 = *b++;
reg_b2 = *b++;
reg_b3 = *b++;
reg_a0 = *a++;
reg_a1 = *a++;
reg_a2 = *a++;
reg_a3 = *a++;
// first row // first row
c[0] += reg_a0 * reg_b0; c0[0] += a[0] * b[0];
c[1] += reg_a0 * reg_b1; c0[1] += a[0] * b[1];
c[2] += reg_a0 * reg_b2; c0[2] += a[0] * b[2];
c[3] += reg_a0 * reg_b3; c0[3] += a[0] * b[3];
// second row // second row
c[4] += reg_a1 * reg_b0; c1[0] += a[1] * b[0];
c[5] += reg_a1 * reg_b1; c1[1] += a[1] * b[1];
c[6] += reg_a1 * reg_b2; c1[2] += a[1] * b[2];
c[7] += reg_a1 * reg_b3; c1[3] += a[1] * b[3];
// third row // third row
c[8] += reg_a2 * reg_b0; c2[0] += a[2] * b[0];
c[9] += reg_a2 * reg_b1; c2[1] += a[2] * b[1];
c[10] += reg_a2 * reg_b2; c2[2] += a[2] * b[2];
c[11] += reg_a2 * reg_b3; c2[3] += a[2] * b[3];
// fourth row // fourth row
c[12] += reg_a3 * reg_b0; c3[0] += a[3] * b[0];
c[13] += reg_a3 * reg_b1; c3[1] += a[3] * b[1];
c[14] += reg_a3 * reg_b2; c3[2] += a[3] * b[2];
c[15] += reg_a3 * reg_b3; c3[3] += a[3] * b[3];
}
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * c[i * MR + j];
} else {
C(i, j) += c[i * MR + j];
}
}
}
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu) {
float c[16] = {0};
float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3;
for (int p = 0; p < k; p += 1) {
reg_b0 = *b++;
reg_b1 = *b++;
reg_b2 = *b++;
reg_b3 = *b++;
reg_a0 = *a++;
reg_a1 = *a++;
reg_a2 = *a++;
reg_a3 = *a++;
// first row
c[0] += reg_a0 * reg_b0;
c[1] += reg_a0 * reg_b1;
c[2] += reg_a0 * reg_b2;
c[3] += reg_a0 * reg_b3;
// second row
c[4] += reg_a1 * reg_b0;
c[5] += reg_a1 * reg_b1;
c[6] += reg_a1 * reg_b2;
c[7] += reg_a1 * reg_b3;
// third row
c[8] += reg_a2 * reg_b0;
c[9] += reg_a2 * reg_b1;
c[10] += reg_a2 * reg_b2;
c[11] += reg_a2 * reg_b3;
// fourth row a += 4;
c[12] += reg_a3 * reg_b0; b += 4;
c[13] += reg_a3 * reg_b1;
c[14] += reg_a3 * reg_b2;
c[15] += reg_a3 * reg_b3;
}
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * c[i * MR + j];
} else {
C(i, j) += c[i * MR + j];
}
if (relu) {
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
}
} }
} }
#endif #endif
// 32位 float 矩阵乘法 // 32位 float 矩阵乘法
void sgemm(int m, int n, int k, float alpha, const float *A, int lda, void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc) { const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
int i, j, p, mc, nc, kc; // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
float beta_; // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int L1 = 30 * 1024;
#ifdef ARMV7 int L2 = 1 * 1024 * 1024;
if (m == 1) {
VectorKernel(1, n, k, alpha, A, lda, B, ldb, beta, C, ldc); KC = k;
return; MC = L2 / (2 * KC * sizeof(float));
} NC = MC;
#endif
// make sure MC is multiple of 4, and NC is multiple of 8
for (j = 0; j < n; j += NC) { int mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + 4 - 1) / 4 * 4;
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
int nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + 8 - 1) / 8 * 8;
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
packedC = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
for (int l = 0; l < KC; ++l) {
zero[l] = 0;
}
int mc, nc;
for (int j = 0; j < n; j += NC) {
nc = s_min(n - j, NC); nc = s_min(n - j, NC);
for (p = 0; p < k; p += KC) { PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
kc = s_min(k - p, KC); for (int i = 0; i < m; i += MC) {
for (i = 0; i < m; i += MC) {
mc = s_min(m - i, MC); mc = s_min(m - i, MC);
if (p != 0) { PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
beta_ = 1.0; InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc,
} else { relu);
beta_ = beta;
}
InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_,
&C(i, j), ldc, i == 0);
}
} }
} }
paddle_mobile::memory::Free(packedA);
paddle_mobile::memory::Free(packedB);
paddle_mobile::memory::Free(packedC);
paddle_mobile::memory::Free(zero);
} }
void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda, void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc) { const float *B, int ldb, float beta, float *C, int ldc,
int i, j, p, mc, nc, kc; bool relu, float *new_scale, float *new_bias) {
float beta_; // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
for (j = 0; j < n; j += NC) { // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int L1 = 30 * 1024;
int L2 = 1 * 1024 * 1024;
KC = k;
MC = L2 / (2 * KC * sizeof(float));
NC = MC;
// make sure MC is multiple of 4, and NC is multiple of 8
int mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + 4 - 1) / 4 * 4;
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
int nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + 8 - 1) / 8 * 8;
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
packedC = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
for (int l = 0; l < KC; ++l) {
zero[l] = 0;
}
int mc, nc;
for (int j = 0; j < n; j += NC) {
nc = s_min(n - j, NC); nc = s_min(n - j, NC);
for (p = 0; p < k; p += KC) { PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
kc = s_min(k - p, KC); for (int i = 0; i < m; i += MC) {
for (i = 0; i < m; i += MC) {
mc = s_min(m - i, MC); mc = s_min(m - i, MC);
if (p != 0) { PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
beta_ = 1.0; InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
} else { &C(i, j), ldc, relu, new_scale + ldc * i + j,
beta_ = beta; new_bias + ldc * i + j);
}
if (p + KC >= k) {
InnerKernel_relu(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb,
beta_, &C(i, j), ldc, i == 0, true);
} else {
InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_,
&C(i, j), ldc, i == 0);
}
}
} }
} }
paddle_mobile::memory::Free(packedA);
paddle_mobile::memory::Free(packedB);
paddle_mobile::memory::Free(packedC);
paddle_mobile::memory::Free(zero);
} }
#ifdef ARMV7
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc) { const float *B, int ldb, float beta, float *C, int ldc,
bool relu) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n)); float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
const float *a0, *b0, *b1, *b2, *b3; const float *a0, *b0, *b1, *b2, *b3;
...@@ -1016,18 +673,995 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -1016,18 +673,995 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
} }
} }
c0 = bufferC; if (alpha != 1) {
C0 = C; VecWriteWithAlphaBeta(n, bufferC, C, ldc);
for (int i = 0; i < n; i++) { return;
if (beta == 1.0) {
*C0++ += *c0++;
} else {
*C0++ = *c0++;
} }
if (beta == 0) {
VecWriteBasic(n, bufferC, C, ldc);
return;
} }
} if (beta == 1 && !relu) {
#endif VecWriteWithAdd(n, bufferC, C, ldc);
return;
} // namespace math }
if (beta == 1 && relu) {
VecWriteWithAddRelu(n, bufferC, C, ldc);
return;
}
}
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
const float *a0, *b0, *b1, *b2, *b3;
float *c0, *C0;
int volatile kc1 = k / 4;
int volatile kc2 = k % 4;
int volatile nc1 = n / 16;
int _nc1 = n % 16;
int volatile nc2 = _nc1 / 4;
int volatile nc3 = _nc1 % 4;
for (int i = 0; i < kc1; i++) {
a0 = A + i * 4;
b0 = B + i * 4 * ldb;
b1 = b0 + ldb;
b2 = b1 + ldb;
b3 = b2 + ldb;
c0 = bufferC;
asm volatile(
"pld [%[a0], #16] \n\t"
"vld1.32 {q0}, [%[a0]] \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"cmp %[i], #0 \n\t"
"beq i_eq0_%= \n\t"
"bne i_ne0_%= \n\t"
"i_eq0_%=: \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"b gemm_nc1_%= \n\t"
"i_ne0_%=: \n\t"
"pld [%[c0], #64] \n\t"
"vld1.32 {q10, q11}, [%[c0]]! \n\t"
"vld1.32 {q12, q13}, [%[c0]] \n\t"
"sub %[c0], %[c0], #32 \n\t"
"gemm_nc1_%=: \n\t"
"pld [%[b0], #64] \n\t"
"vld1.32 {q2, q3}, [%[b0]]! \n\t"
"vld1.32 {q4, q5}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q3, d0[0] \n\t"
"vmla.f32 q12, q4, d0[0] \n\t"
"vmla.f32 q13, q5, d0[0] \n\t"
"pld [%[b1], #64] \n\t"
"vld1.32 {q2, q3}, [%[b1]]! \n\t"
"vld1.32 {q4, q5}, [%[b1]]! \n\t"
"vmla.f32 q10, q2, d0[1] \n\t"
"vmla.f32 q11, q3, d0[1] \n\t"
"vmla.f32 q12, q4, d0[1] \n\t"
"vmla.f32 q13, q5, d0[1] \n\t"
"pld [%[b2], #64] \n\t"
"vld1.32 {q2, q3}, [%[b2]]! \n\t"
"vld1.32 {q4, q5}, [%[b2]]! \n\t"
"vmla.f32 q10, q2, d1[0] \n\t"
"vmla.f32 q11, q3, d1[0] \n\t"
"vmla.f32 q12, q4, d1[0] \n\t"
"vmla.f32 q13, q5, d1[0] \n\t"
"pld [%[b3], #64] \n\t"
"vld1.32 {q2, q3}, [%[b3]]! \n\t"
"vld1.32 {q4, q5}, [%[b3]]! \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q4, d1[1] \n\t"
"vmla.f32 q13, q5, d1[1] \n\t"
"vst1.32 {q10, q11}, [%[c0]]! \n\t"
"vst1.32 {q12, q13}, [%[c0]]! \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"cmp %[i], #0 \n\t"
"beq ii_eq0_%= \n\t"
"bne ii_ne0_%= \n\t"
"ii_eq0_%=: \n\t"
"vmov.f32 q10, #0.0 \n\t"
"b gemm_nc2_%= \n\t"
"ii_ne0_%=: \n\t"
"pld [%[c0], #16] \n\t"
"vld1.32 {q10}, [%[c0]] \n\t"
"gemm_nc2_%=: \n\t"
"pld [%[b0], #16] \n\t"
"vld1.32 {q2}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"pld [%[b1], #16] \n\t"
"vld1.32 {q3}, [%[b1]]! \n\t"
"vmla.f32 q10, q3, d0[1] \n\t"
"pld [%[b2], #16] \n\t"
"vld1.32 {q4}, [%[b2]]! \n\t"
"vmla.f32 q10, q4, d1[0] \n\t"
"pld [%[b3], #16] \n\t"
"vld1.32 {q5}, [%[b3]]! \n\t"
"vmla.f32 q10, q5, d1[1] \n\t"
"vst1.32 {q10}, [%[c0]]! \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
: [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3),
[c0] "+r"(c0)
: [a0] "r"(a0), [i] "r"(i), [nc1] "r"(nc1), [nc2] "r"(nc2)
: "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13");
for (int j = 0; j < nc3; j++) {
if (i == 0) {
*c0 = (*a0) * (*b0++);
} else {
*c0 += (*a0) * (*b0++);
}
*c0 += (*(a0 + 1)) * (*b1++);
*c0 += (*(a0 + 2)) * (*b2++);
*c0 += (*(a0 + 3)) * (*b3++);
c0++;
}
}
for (int i = 0; i < kc2; ++i) {
a0 = A + 4 * kc1 + i;
b0 = B + (4 * kc1 + i) * ldb;
c0 = bufferC;
asm volatile(
"pld [%[a0], #16] \n\t"
"vld1.32 {d0}, [%[a0]] \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"pld [%[c0], #64] \n\t"
"vld1.32 {q10, q11}, [%[c0]]! \n\t"
"vld1.32 {q12, q13}, [%[c0]] \n\t"
"sub %[c0], %[c0], #32 \n\t"
"gemm_nc1_%=: \n\t"
"pld [%[b0], #64] \n\t"
"vld1.32 {q2, q3}, [%[b0]]! \n\t"
"vld1.32 {q4, q5}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q3, d0[0] \n\t"
"vmla.f32 q12, q4, d0[0] \n\t"
"vmla.f32 q13, q5, d0[0] \n\t"
"vst1.32 {q10, q11}, [%[c0]]! \n\t"
"vst1.32 {q12, q13}, [%[c0]]! \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"pld [%[c0], #16] \n\t"
"vld1.32 {q10}, [%[c0]] \n\t"
"gemm_nc2_%=: \n\t"
"vld1.32 {q2}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vst1.32 {q10}, [%[c0]]! \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
: [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3),
[c0] "+r"(c0)
: [a0] "r"(a0), [nc1] "r"(nc1), [nc2] "r"(nc2)
: "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13");
for (int j = 0; j < nc3; j++) {
*c0 += (*a0) * (*b0++);
c0++;
}
}
if (relu) {
VecWriteWithBnRelu(n, bufferC, C, ldc, new_scale, new_bias);
} else {
VecWriteWithBn(n, bufferC, C, ldc, new_scale, new_bias);
}
}
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
const float *a_ptr, *b_ptr;
a_ptr = a;
b_ptr = b;
int kc1 = k / 4;
int kc2 = k % 4;
int step = 4 * ldc;
asm volatile(
"pld [%[a_ptr]] \n\t"
"pld [%[b_ptr]] \n\t"
"vmov.f32 q8, #0.0 \n\t"
"vmov.f32 q9, #0.0 \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"vmov.f32 q14, #0.0 \n\t"
"vmov.f32 q15, #0.0 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
"pld [%[a_ptr], #64] \n\t"
"pld [%[b_ptr], #64] \n\t"
"vld1.32 {q0, q1}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[b_ptr]]! \n\t"
"vmla.f32 q8, q2, d0[0] \n\t"
"vmla.f32 q9, q3, d0[0] \n\t"
"vmla.f32 q10, q2, d0[1] \n\t"
"vmla.f32 q11, q3, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q3, d1[0] \n\t"
"vmla.f32 q14, q2, d1[1] \n\t"
"vmla.f32 q15, q3, d1[1] \n\t"
"vmla.f32 q8, q4, d2[0] \n\t"
"vmla.f32 q9, q5, d2[0] \n\t"
"vmla.f32 q10, q4, d2[1] \n\t"
"vmla.f32 q11, q5, d2[1] \n\t"
"vmla.f32 q12, q4, d3[0] \n\t"
"vmla.f32 q13, q5, d3[0] \n\t"
"vmla.f32 q14, q4, d3[1] \n\t"
"vmla.f32 q15, q5, d3[1] \n\t"
"pld [%[b_ptr], #64] \n\t"
"vld1.32 {q0, q1}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[b_ptr]]! \n\t"
"vmla.f32 q8, q2, d0[0] \n\t"
"vmla.f32 q9, q3, d0[0] \n\t"
"vmla.f32 q10, q2, d0[1] \n\t"
"vmla.f32 q11, q3, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q3, d1[0] \n\t"
"vmla.f32 q14, q2, d1[1] \n\t"
"vmla.f32 q15, q3, d1[1] \n\t"
"vmla.f32 q8, q4, d2[0] \n\t"
"vmla.f32 q9, q5, d2[0] \n\t"
"vmla.f32 q10, q4, d2[1] \n\t"
"vmla.f32 q11, q5, d2[1] \n\t"
"vmla.f32 q12, q4, d3[0] \n\t"
"vmla.f32 q13, q5, d3[0] \n\t"
"vmla.f32 q14, q4, d3[1] \n\t"
"vmla.f32 q15, q5, d3[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t"
"loop_kc2_%=: \n\t"
"vld1.32 {q0}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q8, q2, d0[0] \n\t"
"vmla.f32 q9, q3, d0[0] \n\t"
"vmla.f32 q10, q2, d0[1] \n\t"
"vmla.f32 q11, q3, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q3, d1[0] \n\t"
"vmla.f32 q14, q2, d1[1] \n\t"
"vmla.f32 q15, q3, d1[1] \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"bge loop_kc2_%= \n\t"
"end_kc2_%=: \n\t"
"mov r5, %[c] \n\t"
"mov r6, %[step] \n\t"
"vst1.32 {q8, q9}, [r5], r6 \n\t"
"vst1.32 {q10, q11}, [r5], r6 \n\t"
"vst1.32 {q12, q13}, [r5], r6 \n\t"
"vst1.32 {q14, q15}, [r5] \n\t"
:
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc2] "r"(kc2), [step] "r"(step)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15");
}
// C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16;
int _nc1 = nc % 16;
int step = 4 * ldc;
int step1 = 4 * (NC - 16 * nc1);
int volatile m = mc;
float *volatile c_ptr, *volatile C_ptr;
float *C0, *c0;
c_ptr = c;
C_ptr = C;
if (nc1 > 0) {
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"loop_mc_%=: \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c_ptr]]! \n\t"
"vst1.32 {q0, q1}, [r6]! \n\t"
"vld1.32 {q2, q3}, [%[c_ptr]]! \n\t"
"vst1.32 {q2, q3}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"add %[C_ptr], %[C_ptr], %[step] \n\t"
"add %[c_ptr], %[c_ptr], %[step1] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1),
[step] "r"(step), [step1] "r"(step1)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3");
}
if (_nc1 != 0) {
for (int i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 16 + i * ldc;
c0 = c_ptr + nc1 * 16 + i * NC;
for (int j = 0; j < _nc1; j++) {
*C0++ = *c0++;
}
}
}
}
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
// C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16;
int _nc1 = nc % 16;
int step = 4 * ldc;
int step1 = 4 * (NC - 16 * nc1);
int volatile m = mc;
float *volatile c_ptr, *volatile C_ptr;
float *C0, *c0;
c_ptr = c;
C_ptr = C;
if (nc1 > 0) {
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"loop_mc_%=: \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [r6] \n\t"
"vld1.32 {q2, q3}, [%[c_ptr]]! \n\t"
"vadd.f32 q10, q0, q2 \n\t"
"vadd.f32 q11, q1, q3 \n\t"
"vst1.32 {q10, q11}, [r6]! \n\t"
"vld1.32 {q4, q5}, [r6] \n\t"
"vld1.32 {q6, q7}, [%[c_ptr]]! \n\t"
"vadd.f32 q12, q4, q6 \n\t"
"vadd.f32 q13, q5, q7 \n\t"
"vst1.32 {q12, q13}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"add %[C_ptr], %[C_ptr], %[step] \n\t"
"add %[c_ptr], %[c_ptr], %[step1] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1),
[step] "r"(step), [step1] "r"(step1)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q10", "q11", "q12", "q13");
}
if (_nc1 != 0) {
for (int i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 16 + i * ldc;
c0 = c_ptr + nc1 * 16 + i * NC;
for (int j = 0; j < _nc1; j++) {
*C0++ += *c0++;
}
}
}
}
// C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16;
int _nc1 = nc % 16;
int step = 4 * ldc;
int step1 = 4 * (NC - 16 * nc1);
int volatile m = mc;
float *volatile c_ptr, *volatile C_ptr;
float *C0, *c0;
c_ptr = c;
C_ptr = C;
if (nc1 > 0) {
asm volatile(
"vmov.f32 q14, #0.0 \n\t"
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"loop_mc_%=: \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [r6] \n\t"
"vld1.32 {q2, q3}, [%[c_ptr]]! \n\t"
"vadd.f32 q10, q0, q2 \n\t"
"vadd.f32 q11, q1, q3 \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vmax.f32 q11, q11, q14 \n\t"
"vst1.32 {q10, q11}, [r6]! \n\t"
"vld1.32 {q4, q5}, [r6] \n\t"
"vld1.32 {q6, q7}, [%[c_ptr]]! \n\t"
"vadd.f32 q12, q4, q6 \n\t"
"vadd.f32 q13, q5, q7 \n\t"
"vmax.f32 q12, q12, q14 \n\t"
"vmax.f32 q13, q13, q14 \n\t"
"vst1.32 {q12, q13}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"add %[C_ptr], %[C_ptr], %[step] \n\t"
"add %[c_ptr], %[c_ptr], %[step1] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1),
[step] "r"(step), [step1] "r"(step1)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q10", "q11", "q12", "q13");
}
if (_nc1 != 0) {
for (int i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 16 + i * ldc;
c0 = c_ptr + nc1 * 16 + i * NC;
for (int j = 0; j < _nc1; j++) {
*C0 += *c0;
if (*C0 < 0) {
*C0 = 0;
}
C0++;
c0++;
}
}
}
}
// C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
float *bias) {
int nc1 = nc / 16;
int _nc1 = nc % 16;
int nc2 = _nc1 / 4;
int nc3 = 16 - 4 * (_nc1 % 4);
int step = 4 * (ldc - nc);
int step1 = 4 * (NC - nc);
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"loop_mc_%=: \n\t"
"mov r5, %[nc1] \n\t"
"mov r6, %[nc2] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c]]! \n\t"
"vld1.32 {q2, q3}, [%[scale]]! \n\t"
"vld1.32 {q10, q11}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q2 \n\t"
"vmla.f32 q11, q1, q3 \n\t"
"vst1.32 {q10, q11}, [%[C]]! \n\t"
"vld1.32 {q4, q5}, [%[c]]! \n\t"
"vld1.32 {q6, q7}, [%[scale]]! \n\t"
"vld1.32 {q12, q13}, [%[bias]]! \n\t"
"vmla.f32 q12, q4, q6 \n\t"
"vmla.f32 q13, q5, q7 \n\t"
"vst1.32 {q12, q13}, [%[C]]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"subs r6, r6, #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"vld1.32 {q0}, [%[c]]! \n\t"
"vld1.32 {q1}, [%[scale]]! \n\t"
"vld1.32 {q10}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q1 \n\t"
"vst1.32 {q10}, [%[C]]! \n\t"
"subs r6, r6, #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
"cmp %[nc3], #16 \n\t"
"beq end_nc3_%= \n\t"
"sub %[c], %[c], %[nc3] \n\t"
"sub %[scale], %[scale], %[nc3] \n\t"
"sub %[bias], %[bias], %[nc3] \n\t"
"sub %[C], %[C], %[nc3] \n\t"
"vld1.32 {q0}, [%[c]]! \n\t"
"vld1.32 {q1}, [%[scale]]! \n\t"
"vld1.32 {q10}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q1 \n\t"
"vst1.32 {q10}, [%[C]]! \n\t"
"end_nc3_%=: \n\t"
"add %[c], %[c], %[step1] \n\t"
"add %[scale], %[scale], %[step] \n\t"
"add %[bias], %[bias], %[step] \n\t"
"add %[C], %[C], %[step] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
:
: [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2),
[nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1),
[scale] "r"(scale), [bias] "r"(bias)
: "memory", "cc", "r5", "r6", "r7", "r8", "q0", "q1", "q2", "q3", "q4",
"q5", "q6", "q7", "q10", "q11", "q12", "q13");
}
// C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
float *bias) {
int nc1 = nc / 16;
int _nc1 = nc % 16;
int nc2 = _nc1 / 4;
int nc3 = 16 - 4 * (_nc1 % 4);
int step = 4 * (ldc - nc);
int step1 = 4 * (NC - nc);
asm volatile(
"vmov.f32 q14, #0.0 \n\t"
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"loop_mc_%=: \n\t"
"mov r5, %[nc1] \n\t"
"mov r6, %[nc2] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c]]! \n\t"
"vld1.32 {q2, q3}, [%[scale]]! \n\t"
"vld1.32 {q10, q11}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q2 \n\t"
"vmla.f32 q11, q1, q3 \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vmax.f32 q11, q11, q14 \n\t"
"vst1.32 {q10, q11}, [%[C]]! \n\t"
"vld1.32 {q4, q5}, [%[c]]! \n\t"
"vld1.32 {q6, q7}, [%[scale]]! \n\t"
"vld1.32 {q12, q13}, [%[bias]]! \n\t"
"vmla.f32 q12, q4, q6 \n\t"
"vmla.f32 q13, q5, q7 \n\t"
"vmax.f32 q12, q12, q14 \n\t"
"vmax.f32 q13, q13, q14 \n\t"
"vst1.32 {q12, q13}, [%[C]]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"subs r6, r6, #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"vld1.32 {q0}, [%[c]]! \n\t"
"vld1.32 {q1}, [%[scale]]! \n\t"
"vld1.32 {q10}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q1 \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vst1.32 {q10}, [%[C]]! \n\t"
"subs r6, r6, #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
"cmp %[nc3], #16 \n\t"
"beq end_nc3_%= \n\t"
"sub %[c], %[c], %[nc3] \n\t"
"sub %[scale], %[scale], %[nc3] \n\t"
"sub %[bias], %[bias], %[nc3] \n\t"
"sub %[C], %[C], %[nc3] \n\t"
"vld1.32 {q0}, [%[c]]! \n\t"
"vld1.32 {q1}, [%[scale]]! \n\t"
"vld1.32 {q10}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q1 \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vst1.32 {q10}, [%[C]]! \n\t"
"end_nc3_%=: \n\t"
"add %[c], %[c], %[step1] \n\t"
"add %[scale], %[scale], %[step] \n\t"
"add %[bias], %[bias], %[step] \n\t"
"add %[C], %[C], %[step] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
:
: [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2),
[nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1),
[scale] "r"(scale), [bias] "r"(bias)
: "memory", "r5", "r6", "r7", "r8", "q0", "q1", "q2", "q3", "q4", "q5",
"q6", "q7", "q10", "q11", "q12", "q13", "q14");
}
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc) {
int nc1 = n / 16;
int _nc1 = n % 16;
int nc2 = _nc1 / 4;
int nc3 = 16 - 4 * (_nc1 % 4);
asm volatile(
"subs %[nc1], %[nc1], #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c]]! \n\t"
"vst1.32 {q0, q1}, [%[C]]! \n\t"
"vld1.32 {q2, q3}, [%[c]]! \n\t"
"vst1.32 {q2, q3}, [%[C]]! \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"vld1.32 {q4}, [%[c]]! \n\t"
"vst1.32 {q4}, [%[C]]! \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
"cmp %[nc3], #16 \n\t"
"beq end_nc3_%= \n\t"
"sub %[c], %[c], %[nc3] \n\t"
"sub %[C], %[C], %[nc3] \n\t"
"vld1.32 {q5}, [%[c]]! \n\t"
"vst1.32 {q5}, [%[C]]! \n\t"
"end_nc3_%=: \n\t"
:
: [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5");
}
// C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
// C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
int nc1 = n / 16;
int _nc1 = n % 16;
asm volatile(
"subs %[nc1], %[nc1], #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c]]! \n\t"
"vld1.32 {q2, q3}, [%[C]] \n\t"
"vadd.f32 q10, q0, q2 \n\t"
"vadd.f32 q11, q1, q3 \n\t"
"vst1.32 {q10, q11}, [%[C]]! \n\t"
"vld1.32 {q4, q5}, [%[c]]! \n\t"
"vld1.32 {q6, q7}, [%[C]] \n\t"
"vadd.f32 q12, q4, q6 \n\t"
"vadd.f32 q13, q5, q7 \n\t"
"vst1.32 {q12, q13}, [%[C]]! \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
: [C] "+r"(C), [c] "+r"(c)
: [nc1] "r"(nc1)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
"q12", "q13");
if (_nc1 != 0) {
for (int j = 0; j < _nc1; j++) {
*C++ += *c++;
}
}
}
// C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
int nc1 = n / 16;
int _nc1 = n % 16;
asm volatile(
"vmov.f32 q14, #0.0 \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c]]! \n\t"
"vld1.32 {q2, q3}, [%[C]] \n\t"
"vadd.f32 q10, q0, q2 \n\t"
"vadd.f32 q11, q1, q3 \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vmax.f32 q11, q11, q14 \n\t"
"vst1.32 {q10, q11}, [%[C]]! \n\t"
"vld1.32 {q4, q5}, [%[c]]! \n\t"
"vld1.32 {q6, q7}, [%[C]] \n\t"
"vadd.f32 q12, q4, q6 \n\t"
"vadd.f32 q13, q5, q7 \n\t"
"vmax.f32 q12, q12, q14 \n\t"
"vmax.f32 q13, q13, q14 \n\t"
"vst1.32 {q12, q13}, [%[C]]! \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
: [C] "+r"(C), [c] "+r"(c)
: [nc1] "r"(nc1)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
"q12", "q13");
if (_nc1 != 0) {
for (int j = 0; j < _nc1; j++) {
*C += *c;
if (*C < 0) {
*C = 0;
}
C++;
c++;
}
}
}
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
float *bias) {
int nc1 = n / 16;
int _nc1 = n % 16;
int nc2 = _nc1 / 4;
int nc3 = 16 - 4 * (_nc1 % 4);
asm volatile(
"subs %[nc1], %[nc1], #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c]]! \n\t"
"vld1.32 {q2, q3}, [%[scale]]! \n\t"
"vld1.32 {q10, q11}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q2 \n\t"
"vmla.f32 q11, q1, q3 \n\t"
"vst1.32 {q10, q11}, [%[C]]! \n\t"
"vld1.32 {q4, q5}, [%[c]]! \n\t"
"vld1.32 {q6, q7}, [%[scale]]! \n\t"
"vld1.32 {q12, q13}, [%[bias]]! \n\t"
"vmla.f32 q12, q4, q6 \n\t"
"vmla.f32 q13, q5, q7 \n\t"
"vst1.32 {q12, q13}, [%[C]]! \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"vld1.32 {q0}, [%[c]]! \n\t"
"vld1.32 {q1}, [%[scale]]! \n\t"
"vld1.32 {q10}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q1 \n\t"
"vst1.32 {q10}, [%[C]]! \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
"cmp %[nc3], #16 \n\t"
"beq end_nc3_%= \n\t"
"sub %[c], %[c], %[nc3] \n\t"
"sub %[scale], %[scale], %[nc3] \n\t"
"sub %[bias], %[bias], %[nc3] \n\t"
"sub %[C], %[C], %[nc3] \n\t"
"vld1.32 {q0}, [%[c]]! \n\t"
"vld1.32 {q1}, [%[scale]]! \n\t"
"vld1.32 {q10}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q1 \n\t"
"vst1.32 {q10}, [%[C]]! \n\t"
"end_nc3_%=: \n\t"
:
: [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3),
[scale] "r"(scale), [bias] "r"(bias)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
"q12", "q13");
}
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
float *bias) {
int nc1 = n / 16;
int _nc1 = n % 16;
int nc2 = _nc1 / 4;
int nc3 = 16 - 4 * (_nc1 % 4);
asm volatile(
"vmov.f32 q14, #0.0 \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c]]! \n\t"
"vld1.32 {q2, q3}, [%[scale]]! \n\t"
"vld1.32 {q10, q11}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q2 \n\t"
"vmla.f32 q11, q1, q3 \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vmax.f32 q11, q11, q14 \n\t"
"vst1.32 {q10, q11}, [%[C]]! \n\t"
"vld1.32 {q4, q5}, [%[c]]! \n\t"
"vld1.32 {q6, q7}, [%[scale]]! \n\t"
"vld1.32 {q12, q13}, [%[bias]]! \n\t"
"vmla.f32 q12, q4, q6 \n\t"
"vmla.f32 q13, q5, q7 \n\t"
"vmax.f32 q12, q12, q14 \n\t"
"vmax.f32 q13, q13, q14 \n\t"
"vst1.32 {q12, q13}, [%[C]]! \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"vld1.32 {q0}, [%[c]]! \n\t"
"vld1.32 {q1}, [%[scale]]! \n\t"
"vld1.32 {q10}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q1 \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vst1.32 {q10}, [%[C]]! \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
"cmp %[nc3], #16 \n\t"
"beq end_nc3_%= \n\t"
"sub %[c], %[c], %[nc3] \n\t"
"sub %[scale], %[scale], %[nc3] \n\t"
"sub %[bias], %[bias], %[nc3] \n\t"
"sub %[C], %[C], %[nc3] \n\t"
"vld1.32 {q0}, [%[c]]! \n\t"
"vld1.32 {q1}, [%[scale]]! \n\t"
"vld1.32 {q10}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q1 \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vst1.32 {q10}, [%[C]]! \n\t"
"end_nc3_%=: \n\t"
:
: [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3),
[scale] "r"(scale), [bias] "r"(bias)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
"q12", "q13", "q14");
}
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
} // namespace paddle_mobile
...@@ -19,12 +19,8 @@ limitations under the License. */ ...@@ -19,12 +19,8 @@ limitations under the License. */
#define B(i, j) B[(i)*ldb + (j)] #define B(i, j) B[(i)*ldb + (j)]
#define C(i, j) C[(i)*ldc + (j)] #define C(i, j) C[(i)*ldc + (j)]
// 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k
#define MC 128
#define KC 128
#define NC 1024
#define MR 4 #define MR 4
#define NR 4 #define NR 8
#define s_min(i, j) ((i) < (j) ? (i) : (j)) #define s_min(i, j) ((i) < (j) ? (i) : (j))
...@@ -49,28 +45,66 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, ...@@ -49,28 +45,66 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
// 分块矩阵乘法 // 分块矩阵乘法
void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda, void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
const float *B, int ldb, float beta, float *C, int ldc, float beta, float *c, float *C, int ldc, bool relu);
int first_time);
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
// 向量矩阵乘法 (M = 1) // 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc); const float *B, int ldb, float beta, float *C, int ldc,
// 计算一个更小的 4 * 4 的 C 矩阵分块
void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B,
int ldb, float beta, float *C, int ldc, int mc, int nc);
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu); bool relu);
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias);
// 计算一个更小的 C 矩阵分块
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
// 分块矩阵乘法结果回写
// C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias);
// 向量矩阵乘法结果回写
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
// C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// 32位 float 矩阵乘法 // 32位 float 矩阵乘法
void sgemm(int m, int n, int k, float alpha, const float *A, int lda, void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc); const float *B, int ldb, float beta, float *C, int ldc, bool relu);
void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda, // 32位 float 矩阵乘法, 并对结果进行 batchnrom
const float *B, int ldb, float beta, float *C, int ldc); void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
// 64位 double 矩阵乘法 // 64位 double 矩阵乘法
void dgemm(int m, int n, int k, float alpha, const double *A, int lda, void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
......
...@@ -39,22 +39,18 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -39,22 +39,18 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
if (relu) { Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
sgemm_relu(M, N, K, alpha, matrix_a.data<float>(), K, beta, matrix_out->data<float>(), N, relu);
matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N);
} else {
sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
beta, matrix_out->data<float>(), N);
}
} }
template <> template <>
void matmul<double>(const framework::Tensor &matrix_a, bool trans_a, void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
double alpha, framework::Tensor *matrix_out, double beta, float alpha, framework::Tensor *matrix_out, float beta,
bool relu) { bool relu, framework::Tensor *new_scale,
framework::Tensor *new_bias) {
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -71,7 +67,11 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -71,7 +67,11 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
N, beta, matrix_out->data<float>(), N, relu,
new_scale->data<float>(), new_bias->data<float>());
} }
} // namespace math } // namespace math
......
...@@ -26,6 +26,12 @@ template <typename T> ...@@ -26,6 +26,12 @@ template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false); framework::Tensor *matrix_out, T beta, bool relu = false);
template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu,
framework::Tensor *new_scale, framework::Tensor *new_bias);
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册