Commit 11b3a655 authored by Z zhaojiaying01

fix android cross-compilation on armv8 platform

Parent 098be2ab
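The change makes the NEON code build for 64-bit Android targets by guarding the architecture-specific blocks. A minimal sketch of the guard pattern used throughout the diff (the concrete blocks appear below):

#if __ARM_NEON
#if __aarch64__
// armv8: A64 inline assembly (prfm, ld1/st1) or NEON intrinsics
#else
// armv7: A32 inline assembly (pld, vld1.32/vst1.32)
#endif  // __aarch64__
#else
// plain C fallback
#endif  // __ARM_NEON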
......@@ -33,6 +33,7 @@ float *packedA;
float *packedB;
float *packedC;
float *zero;
/*
// Pack a block of matrix A into contiguous memory (ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer) {
......@@ -60,6 +61,36 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
}
}
// Pack a block of matrix B into contiguous memory (ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) {
int i, j;
const float *Bj, *Bj1, *Bj2, *Bj3;
for (j = 0; j < n - n_tail; j += NR) {
Bj = &B(0, j);
Bj1 = &B(0, j + 1);
Bj2 = &B(0, j + 2);
Bj3 = &B(0, j + 3);
for (i = 0; i < k; ++i) {
*buffer++ = *Bj++;
*buffer++ = *Bj1++;
*buffer++ = *Bj2++;
*buffer++ = *Bj3++;
}
}
if (n_tail != 0) {
for (i = 0; i < k; ++i) {
for (int j = n - n_tail; j < n; ++j) {
*buffer++ = B(i, j);
}
for (int j = n; j < n + (NR - n_tail); ++j) {
*buffer++ = 0;
}
}
}
}
*/
// Pack a block of matrix A into contiguous memory (RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
float *buffer) {
......@@ -100,35 +131,6 @@ void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
}
}
// Pack a block of matrix B into contiguous memory (RowMajor)
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) {
......@@ -138,18 +140,31 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
b0 = &B(i, j);
#if __ARM_NEON
#if __aarch64__
asm volatile(
"prfm pldl1keep, [%[b0]] \n\t"
"ld1 {v0.4s, v1.4s}, [%[b0]] \n\t"
"st1 {v0.4s, v1.4s}, [%[buffer]], #32 \n\t"
: [buffer] "+r"(buffer)
: [b0] "r"(b0)
: "memory", "v0", "v1");
#else
asm volatile(
"pld [%[b0]] \n\t"
"vld1.32 {q0, q1}, [%[b0]] \n\t"
"vst1.32 {q0, q1}, [%[buffer]]! \n\t"
"pld [%[b0]] \n\t"
"vld1.32 {q0, q1}, [%[b0]] \n\t"
"vst1.32 {q0, q1}, [%[buffer]]! \n\t"
: [buffer] "+r"(buffer)
: [b0] "r"(b0)
: "memory", "q0", "q0");
: "memory", "q0", "q1");
#endif // __aarch64__
#else
*buffer++ = *b0++;
*buffer++ = *b0++;
*buffer++ = *b0++;
*buffer++ = *b0++;
*buffer++ = *b0++;
*buffer++ = *b0++;
*buffer++ = *b0++;
*buffer++ = *b0++;
#endif // __ARM_NEON
}
}
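/*
 * Annotation (not part of the patch): the same eight-float copy can also be
 * written with NEON intrinsics that compile on both armv7 and aarch64, which
 * would avoid the per-architecture inline assembly above. Sketch, assuming
 * <arm_neon.h> is included and b0 points at eight valid floats:
 *
 *   float32x4_t v0 = vld1q_f32(b0);
 *   float32x4_t v1 = vld1q_f32(b0 + 4);
 *   vst1q_f32(buffer, v0);
 *   vst1q_f32(buffer + 4, v1);
 *   buffer += 8;
 */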
......@@ -217,7 +232,7 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
#if __ARM_NEON
#if __aarch64__
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
// init C
float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0);
......@@ -244,23 +259,264 @@ void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
a += MR;
b += NR;
}
vst1q_f32(c, cv0);
vst1q_f32(c + ldc, cv1);
vst1q_f32(c + 2 * ldc, cv2);
vst1q_f32(c + 3 * ldc, cv3);
// float32x4x4_t cv = {cv0, cv1, cv2, cv3};
}
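// For reference only (not part of the patch): a plain-C equivalent of the
// aarch64 4x4 micro-kernel above. It mirrors the loop structure of the
// intrinsics version (a and b advance by MR and NR floats per k step) and
// writes the four result rows to c with row stride ldc. The helper name is
// hypothetical.
static void AddDot4x4_ref(int k, const float *a, const float *b, float *c,
                          int ldc) {
  float acc[4][4] = {{0.0f}};
  for (int p = 0; p < k; ++p) {
    for (int i = 0; i < 4; ++i) {
      for (int j = 0; j < 4; ++j) {
        acc[i][j] += a[i] * b[j];
      }
    }
    a += MR;
    b += NR;
  }
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      c[i * ldc + j] = acc[i][j];
    }
  }
}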
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
// init C
float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0);
float32x4_t cv2 = vdupq_n_f32(0.0);
float32x4_t cv3 = vdupq_n_f32(0.0);
float32x4_t cv4 = vdupq_n_f32(0.0);
float32x4_t cv5 = vdupq_n_f32(0.0);
float32x4_t cv6 = vdupq_n_f32(0.0);
float32x4_t cv7 = vdupq_n_f32(0.0);
float32x4_t av;
float32x4_t bv0;
float32x4_t bv1;
float32x2_t av01;
float32x2_t av23;
for (int p = 0; p < k; p += 1) {
av = vld1q_f32(a);
bv0 = vld1q_f32(b);
bv1 = vld1q_f32(b + 4);
av01 = vget_low_f32(av);
cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0);
cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0);
cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1);
cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1);
av23 = vget_high_f32(av);
cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0);
cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0);
cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1);
cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1);
a += MR;
b += NR;
}
vst1q_f32(c, cv0);
vst1q_f32(c + 4, cv1);
vst1q_f32(c + ldc, cv2);
vst1q_f32(c + ldc + 4, cv3);
vst1q_f32(c + 2 * ldc, cv4);
vst1q_f32(c + 2 * ldc + 4, cv5);
vst1q_f32(c + 3 * ldc, cv6);
vst1q_f32(c + 3 * ldc + 4, cv7);
}
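// Annotation (not part of the patch): like AddDot4x4, this kernel consumes the
// packed operands in place, reading four floats of a and eight floats of b per
// k step (advancing them by MR and NR respectively), and writes the
// accumulated 4 x 8 tile to c, two 4-float vectors per row, with row stride
// ldc.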
// Write back the result of the blocked matrix multiplication
// C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 4;
int _nc1 = nc % 4;
float *c_ptr, *C_ptr;
float32x4_t cv;
for (int i = 0; i < mc; ++i) {
c_ptr = c + i * NC;
C_ptr = C + i * ldc;
for (int j = 0; j < nc1; ++j) {
cv = vld1q_f32(c_ptr);
vst1q_f32(C_ptr, cv);
c_ptr += 4;
C_ptr += 4;
}
if (_nc1 != 0) {
cv = vld1q_f32(c_ptr);
if (_nc1 >= 1) {
vst1q_lane_f32(C_ptr, cv, 0);
C_ptr++;
}
if (_nc1 >= 2) {
vst1q_lane_f32(C_ptr, cv, 1);
C_ptr++;
}
if (_nc1 >= 3) {
vst1q_lane_f32(C_ptr, cv, 2);
}
}
}
}
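// Annotation (not part of the patch): the nc % 4 remainder is handled by
// loading one full 4-lane vector from the packed buffer and storing only the
// valid lanes into C with vst1q_lane_f32; the same tail pattern is reused by
// the Write* variants below.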
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
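// WriteWithAlphaBeta is left as an empty stub in this commit. A scalar sketch
// of what it would compute (hypothetical; alpha and beta are not part of the
// current signature and would have to be passed in or kept as globals like the
// pack buffers above):
//
//   for (int i = 0; i < mc; ++i) {
//     for (int j = 0; j < nc; ++j) {
//       C[i * ldc + j] = alpha * c[i * NC + j] + beta * C[i * ldc + j];
//     }
//   }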
// C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 4;
int _nc1 = nc % 4;
float *c_ptr, *C_ptr;
float32x4_t cv;
float32x4_t cv1;
for (int i = 0; i < mc; ++i) {
c_ptr = c + i * NC;
C_ptr = C + i * ldc;
for (int j = 0; j < nc1; ++j) {
cv = vld1q_f32(c_ptr);
cv1 = vld1q_f32(C_ptr);
cv = vaddq_f32(cv, cv1);
vst1q_f32(C_ptr, cv);
c_ptr += 4;
C_ptr += 4;
}
if (_nc1 != 0) {
cv = vld1q_f32(c_ptr);
cv1 = vld1q_f32(C_ptr);
cv = vaddq_f32(cv, cv1);
if (_nc1 >= 1) {
vst1q_lane_f32(C_ptr, cv, 0);
C_ptr++;
}
if (_nc1 >= 2) {
vst1q_lane_f32(C_ptr, cv, 1);
C_ptr++;
}
if (_nc1 >= 3) {
vst1q_lane_f32(C_ptr, cv, 2);
}
}
}
}
// C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 4;
int _nc1 = nc % 4;
float *c_ptr, *C_ptr;
float32x4_t cv;
float32x4_t cv1;
float32x4_t zero = vdupq_n_f32(0.0);
for (int i = 0; i < mc; ++i) {
c_ptr = c + i * NC;
C_ptr = C + i * ldc;
for (int j = 0; j < nc1; ++j) {
cv = vld1q_f32(c_ptr);
cv1 = vld1q_f32(C_ptr);
cv = vaddq_f32(cv, cv1);
cv = vmaxq_f32(cv, zero);
vst1q_f32(C_ptr, cv);
c_ptr += 4;
C_ptr += 4;
}
if (_nc1 != 0) {
cv = vld1q_f32(c_ptr);
cv1 = vld1q_f32(C_ptr);
cv = vaddq_f32(cv, cv1);
cv = vmaxq_f32(cv, zero);
if (_nc1 >= 1) {
vst1q_lane_f32(C_ptr, cv, 0);
C_ptr++;
}
if (_nc1 >= 2) {
vst1q_lane_f32(C_ptr, cv, 1);
C_ptr++;
}
if (_nc1 >= 3) {
vst1q_lane_f32(C_ptr, cv, 2);
}
}
}
}
// C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
float *new_bias) {
int nc1 = nc / 4;
int _nc1 = nc % 4;
float *c_ptr, *C_ptr;
float32x4_t cv;
float32x4_t cv1;
float32x4_t bias;
float32x2_t scale;
for (int i = 0; i < mc; ++i) {
c_ptr = c + i * NC;
C_ptr = C + i * ldc;
bias = vld1q_dup_f32(new_bias);
scale = vld1_dup_f32(new_scale);
new_bias++;
new_scale++;
float scale0 = vget_lane_f32(scale, 0);
for (int j = 0; j < nc1; ++j) {
cv = vld1q_f32(c_ptr);
cv = vmlaq_n_f32(bias, cv, scale0);
vst1q_f32(C_ptr, cv);
c_ptr += 4;
C_ptr += 4;
}
if (_nc1 != 0) {
cv = vld1q_f32(c_ptr);
cv = vmlaq_n_f32(bias, cv, scale0);
if (_nc1 >= 1) {
vst1q_lane_f32(C_ptr, cv, 0);
C_ptr++;
}
if (_nc1 >= 2) {
vst1q_lane_f32(C_ptr, cv, 1);
C_ptr++;
}
if (_nc1 >= 3) {
vst1q_lane_f32(C_ptr, cv, 2);
C_ptr++;
}
}
}
}
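// Annotation (not part of the patch): WriteWithBn applies a per-row affine
// transform while writing back, C(i, j) = new_scale[i] * c(i, j) + new_bias[i];
// new_scale and new_bias are assumed to hold the batch-norm scale and bias
// already folded per output channel, one value per row of the tile.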
// C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias) {
int nc1 = nc / 4;
int _nc1 = nc % 4;
float *c_ptr, *C_ptr;
float32x4_t cv;
float32x4_t bias;
float32x2_t scale;
float32x4_t zero = vdupq_n_f32(0.0);
for (int i = 0; i < mc; ++i) {
c_ptr = c + i * NC;
C_ptr = C + i * ldc;
bias = vld1q_dup_f32(new_bias);
scale = vld1_dup_f32(new_scale);
new_bias++;
new_scale++;
float scale0 = vget_lane_f32(scale, 0);
for (int j = 0; j < nc1; ++j) {
cv = vld1q_f32(c_ptr);
cv = vmlaq_n_f32(bias, cv, scale0);
cv = vmaxq_f32(cv, zero);
vst1q_f32(C_ptr, cv);
c_ptr += 4;
C_ptr += 4;
}
if (_nc1 != 0) {
cv = vld1q_f32(c_ptr);
cv = vmlaq_n_f32(bias, cv, scale0);
cv = vmaxq_f32(cv, zero);
if (_nc1 >= 1) {
vst1q_lane_f32(C_ptr, cv, 0);
C_ptr++;
}
if (_nc1 >= 2) {
vst1q_lane_f32(C_ptr, cv, 1);
C_ptr++;
}
if (_nc1 >= 3) {
vst1q_lane_f32(C_ptr, cv, 2);
}
}
}
......@@ -338,6 +594,7 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
"q10", "q11", "q12", "q13");
}
/*
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu) {
......@@ -770,6 +1027,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
VecWriteWithBn(n, bufferC, C, ldc, new_scale, new_bias);
}
}
*/
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
const float *a_ptr, *b_ptr;
......@@ -1288,6 +1546,7 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
"q8", "q10", "q11", "q12", "q13", "q14");
}
/*
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc) {
int nc1 = n / 16;
......@@ -1563,6 +1822,7 @@ void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
"q12", "q13", "q14");
}
*/
#endif // __aarch64__
#else
......
......@@ -28,6 +28,7 @@ namespace paddle_mobile {
namespace operators {
namespace math {
/*
// Pack a block of matrix A into contiguous memory (ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
......@@ -35,6 +36,7 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
// Pack a block of matrix B into contiguous memory (ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
*/
// Pack a block of matrix A into contiguous memory (RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
......@@ -51,7 +53,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
/*
// Vector-matrix multiplication (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
......@@ -60,6 +62,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias);
*/
// Compute a smaller block of matrix C
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
......@@ -81,6 +84,7 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias);
/*
// Write back the result of the vector-matrix multiplication
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc);
......@@ -96,6 +100,7 @@ void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
*/
// 32-bit float matrix multiplication
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
......