提交 272d1ea1 编写于 作者: L liuruilong

synchronizing code

上级 309422be
...@@ -473,7 +473,7 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, ...@@ -473,7 +473,7 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc, int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu = false) { bool relu = false) {
int kc1 = k / 2, kc2 = k % 2; int kc1 = k / 4, kc2 = k % 4;
int bytes_ldc = 4 * ldc; int bytes_ldc = 4 * ldc;
int flag_alpha = (alpha == 1.0) ? 1 : 2; int flag_alpha = (alpha == 1.0) ? 1 : 2;
int flag_beta; int flag_beta;
...@@ -484,16 +484,29 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, ...@@ -484,16 +484,29 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
} else { } else {
flag_beta = 2; flag_beta = 2;
} }
asm volatile( asm volatile(
"pld [%[a]] \n\t"
"pld [%[b]] \n\t"
"vmov.f32 q10, #0.0 \n\t" "vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t" "vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t" "vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t" "vmov.f32 q13, #0.0 \n\t"
"vmov.f32 q14, #0.0 \n\t"
"subs %[kc1], %[kc1], #1 \n\t" "subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t" "blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t" "loop_kc1_%=: \n\t"
"pld [%[a], #64] \n\t"
"pld [%[b], #64] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t" "vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t" "vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t" "vmla.f32 q10, q2, d0[0] \n\t"
...@@ -536,7 +549,7 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, ...@@ -536,7 +549,7 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"vmul.f32 q13, q13, d8[0] \n\t" "vmul.f32 q13, q13, d8[0] \n\t"
"beta_%=: \n\t" "beta_%=: \n\t"
"cmp %[flag_beta], #0 \n\t" "cmp %[flag_beta], #0 \n\t"
"beq memory_%= \n\t" "beq memory_%= \n\t"
"mov r4, %[C] \n\t" "mov r4, %[C] \n\t"
...@@ -545,7 +558,7 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, ...@@ -545,7 +558,7 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"vld1.32 {q1}, [r4], r6 \n\t" "vld1.32 {q1}, [r4], r6 \n\t"
"vld1.32 {q2}, [r4], r6 \n\t" "vld1.32 {q2}, [r4], r6 \n\t"
"vld1.32 {q3}, [r4] \n\t" "vld1.32 {q3}, [r4] \n\t"
"cmp %[flag_beta], #1 \n\t" "cmp %[flag_beta], #1 \n\t"
"beq beta_eq1_%= \n\t" "beq beta_eq1_%= \n\t"
"bne beta_ne1_%= \n\t" "bne beta_ne1_%= \n\t"
...@@ -569,7 +582,6 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, ...@@ -569,7 +582,6 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"vmax.f32 q13, q13, q14 \n\t" "vmax.f32 q13, q13, q14 \n\t"
"mov r5, %[C] \n\t" "mov r5, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t" "mov r6, %[bytes_ldc]\n\t"
"vst1.32 {q10}, [r5], r6 \n\t" "vst1.32 {q10}, [r5], r6 \n\t"
"vst1.32 {q11}, [r5], r6 \n\t" "vst1.32 {q11}, [r5], r6 \n\t"
"vst1.32 {q12}, [r5], r6 \n\t" "vst1.32 {q12}, [r5], r6 \n\t"
...@@ -585,8 +597,7 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, ...@@ -585,8 +597,7 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
[kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha), [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha),
[beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc), [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc),
[flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta) [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta)
: "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13", : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13");
"q14");
if (mc != MR || nc != NR) { if (mc != MR || nc != NR) {
int i, j; int i, j;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册