提交 adf5c9da 编写于 作者: xiebaiyuan's avatar xiebaiyuan

trans gemm to class && add multi instance support && to unit test

上级 266635bb
...@@ -26,7 +26,7 @@ limitations under the License. */ ...@@ -26,7 +26,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
int MC = 0; /*int MC = 0;
int KC = 0; int KC = 0;
int NC = 0; int NC = 0;
...@@ -40,7 +40,7 @@ typedef void (*FnAddDot)(int, const float *, const float *, float *, int); ...@@ -40,7 +40,7 @@ typedef void (*FnAddDot)(int, const float *, const float *, float *, int);
FnPack procPackA; FnPack procPackA;
FnPack procPackB; FnPack procPackB;
FnAddDot procAddDot; FnAddDot procAddDot;*/
/* /*
// 将A矩阵分块复制到连续内存(ColMajor) // 将A矩阵分块复制到连续内存(ColMajor)
...@@ -101,7 +101,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -101,7 +101,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
*/ */
// 将A矩阵分块复制到连续内存(RowMajor) // 将A矩阵分块复制到连续内存(RowMajor)
void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, void Gemm::PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
const float *a0, *a1, *a2, *a3; const float *a0, *a1, *a2, *a3;
for (int i = 0; i < m - m_tail; i += MR) { for (int i = 0; i < m - m_tail; i += MR) {
...@@ -142,7 +142,7 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, ...@@ -142,7 +142,7 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
} }
} }
void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, void Gemm::PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
const int i_length = m - m_tail; const int i_length = m - m_tail;
for (int i = 0; i < i_length; i += MR) { for (int i = 0; i < i_length; i += MR) {
...@@ -196,7 +196,7 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, ...@@ -196,7 +196,7 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
} }
} }
void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, void Gemm::PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
const int i_length = m - m_tail; const int i_length = m - m_tail;
#pragma omp parallel for #pragma omp parallel for
...@@ -251,7 +251,7 @@ void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, ...@@ -251,7 +251,7 @@ void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
} }
} }
void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, void Gemm::PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
const int i_length = m - m_tail; const int i_length = m - m_tail;
for (int i = 0; i < i_length; i += MR) { for (int i = 0; i < i_length; i += MR) {
...@@ -317,7 +317,7 @@ void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, ...@@ -317,7 +317,7 @@ void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
} }
} }
void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, void Gemm::PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
const int i_length = m - m_tail; const int i_length = m - m_tail;
#pragma omp parallel for #pragma omp parallel for
...@@ -385,7 +385,7 @@ void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, ...@@ -385,7 +385,7 @@ void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
} }
// 将B矩阵分块复制到连续内存(RowMajor) // 将B矩阵分块复制到连续内存(RowMajor)
void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, void Gemm::PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) { float *buffer) {
const int j_length = n - n_tail; const int j_length = n - n_tail;
for (int j = 0; j < j_length; j += NR) { for (int j = 0; j < j_length; j += NR) {
...@@ -436,7 +436,7 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, ...@@ -436,7 +436,7 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
} }
} }
void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, void Gemm::PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) { float *buffer) {
const int j_length = n - n_tail; const int j_length = n - n_tail;
#pragma omp parallel for #pragma omp parallel for
...@@ -489,7 +489,7 @@ void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, ...@@ -489,7 +489,7 @@ void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
} }
#if __aarch64__ #if __aarch64__
void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, void Gemm::PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) { float *buffer) {
const int j_length = n - n_tail; const int j_length = n - n_tail;
for (int j = 0; j < j_length; j += NR) { for (int j = 0; j < j_length; j += NR) {
...@@ -519,8 +519,8 @@ void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, ...@@ -519,8 +519,8 @@ void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
} }
} }
void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb, void Gemm::PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B,
float *buffer) { int ldb, float *buffer) {
const int j_length = n - n_tail; const int j_length = n - n_tail;
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < j_length; j += NR) { for (int j = 0; j < j_length; j += NR) {
...@@ -550,7 +550,7 @@ void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb, ...@@ -550,7 +550,7 @@ void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
} }
} }
void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, void Gemm::PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) { float *buffer) {
const int j_length = n - n_tail; const int j_length = n - n_tail;
for (int j = 0; j < n - n_tail; j += NR) { for (int j = 0; j < n - n_tail; j += NR) {
...@@ -580,8 +580,8 @@ void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, ...@@ -580,8 +580,8 @@ void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
} }
} }
void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb, void Gemm::PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B,
float *buffer) { int ldb, float *buffer) {
const int j_length = n - n_tail; const int j_length = n - n_tail;
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < n - n_tail; j += NR) { for (int j = 0; j < n - n_tail; j += NR) {
...@@ -613,8 +613,9 @@ void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb, ...@@ -613,8 +613,9 @@ void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
#endif // __aarch64__ #endif // __aarch64__
// 分块矩阵乘法 // 分块矩阵乘法
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, void Gemm::InnerKernel(int mc, int nc, float alpha, const float *a,
float beta, float *c, float *C, int ldc, bool relu) { const float *b, float beta, float *c, float *C, int ldc,
bool relu) {
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < nc; j += NR) { for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) { for (int i = 0; i < mc; i += MR) {
...@@ -648,7 +649,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, ...@@ -648,7 +649,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
} }
// 分块矩阵乘法 // 分块矩阵乘法
void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, void Gemm::InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *bias) { int ldc, bool relu, float *bias) {
#pragma omp parallel for #pragma omp parallel for
...@@ -692,9 +693,10 @@ void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, ...@@ -692,9 +693,10 @@ void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
} }
// 分块矩阵乘法 // 分块矩阵乘法
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, void Gemm::InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc, const float *b, float beta, float *c, float *C,
bool relu, float *new_scale, float *new_bias) { int ldc, bool relu, float *new_scale,
float *new_bias) {
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < nc; j += NR) { for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) { for (int i = 0; i < mc; i += MR) {
...@@ -717,10 +719,10 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, ...@@ -717,10 +719,10 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
} }
// 分块矩阵乘法 // 分块矩阵乘法
void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, void Gemm::InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *new_scale, float *new_bias, int ldc, bool relu, float *new_scale,
float *bias) { float *new_bias, float *bias) {
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < nc; j += NR) { for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) { for (int i = 0; i < mc; i += MR) {
...@@ -737,7 +739,7 @@ void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, ...@@ -737,7 +739,7 @@ void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias); WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias);
} }
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, void Gemm::InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
float *c, float *C, int ldc, float *p, float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1) { std::string mode, float *bias, float *bias1) {
#pragma omp parallel for #pragma omp parallel for
...@@ -759,7 +761,7 @@ void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, ...@@ -759,7 +761,7 @@ void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
#if __ARM_NEON #if __ARM_NEON
#if __aarch64__ #if __aarch64__
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
// init C // init C
float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0);
...@@ -794,7 +796,7 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -794,7 +796,7 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
// float32x4x4_t cv = {cv0, cv1, cv2, cv3}; // float32x4x4_t cv = {cv0, cv1, cv2, cv3};
} }
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
// init C // init C
float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0);
...@@ -844,7 +846,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -844,7 +846,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
// 分块矩阵乘法结果回写 // 分块矩阵乘法结果回写
// C = A * B // C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -877,10 +879,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { ...@@ -877,10 +879,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
// C = A * B + C // C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -917,7 +919,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { ...@@ -917,7 +919,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
} }
} }
// C = A * B + bias // C = A * B + bias
void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc,
float *bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -955,7 +958,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { ...@@ -955,7 +958,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {
} }
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -996,7 +999,7 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { ...@@ -996,7 +999,7 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = A * B + bias, relu(C) // C = A * B + bias, relu(C)
void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
float *bias) { float *bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -1038,8 +1041,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, ...@@ -1038,8 +1041,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B + C,prelu(C) // C = A * B + C,prelu(C)
void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc,
std::string mode, float *bias, float *bias1) { float *p, std::string mode, float *bias,
float *bias1) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -1114,8 +1118,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, ...@@ -1114,8 +1118,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
} }
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
float *new_bias) { float *new_scale, float *new_bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -1159,7 +1163,7 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, ...@@ -1159,7 +1163,7 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
} }
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias) { float *new_scale, float *new_bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -1205,7 +1209,7 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -1205,7 +1209,7 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B, batchnorm(C),C = C + bias; relu(C) // C = A * B, batchnorm(C),C = C + bias; relu(C)
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias) { float *new_scale, float *new_bias, float *bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -1259,7 +1263,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -1259,7 +1263,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
#else #else
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
const float *a_ptr, *b_ptr; const float *a_ptr, *b_ptr;
a_ptr = a; a_ptr = a;
b_ptr = b; b_ptr = b;
...@@ -1330,10 +1334,9 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -1330,10 +1334,9 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
} }
/* /*
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A, int
const float *B, int ldb, float beta, float *C, int ldc, lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu) { float
bool relu) { *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
const float *a0, *b0, *b1, *b2, *b3; const float *a0, *b0, *b1, *b2, *b3;
float *c0, *C0; float *c0, *C0;
...@@ -1552,7 +1555,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -1552,7 +1555,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
} }
} }
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, void Gemm::VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C, int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias) { int ldc, bool relu, float *new_scale, float *new_bias) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n)); float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
...@@ -1764,7 +1767,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, ...@@ -1764,7 +1767,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
} }
*/ */
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
const float *a_ptr, *b_ptr; const float *a_ptr, *b_ptr;
a_ptr = a; a_ptr = a;
b_ptr = b; b_ptr = b;
...@@ -1872,7 +1875,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -1872,7 +1875,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
} }
// C = A * B // C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int step = 4 * ldc; int step = 4 * ldc;
...@@ -1929,10 +1932,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { ...@@ -1929,10 +1932,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
// C = A * B + C // C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int step = 4 * ldc; int step = 4 * ldc;
...@@ -1996,7 +1999,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { ...@@ -1996,7 +1999,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = A * B + bias // C = A * B + bias
void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc,
float *bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -2034,7 +2038,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { ...@@ -2034,7 +2038,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {
} }
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int step = 4 * ldc; int step = 4 * ldc;
...@@ -2108,7 +2112,7 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { ...@@ -2108,7 +2112,7 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = A * B + bias, relu(C) // C = A * B + bias, relu(C)
void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
float *bias) { float *bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -2149,8 +2153,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, ...@@ -2149,8 +2153,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
} }
} }
void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc,
std::string mode, float *bias, float *bias1) { float *p, std::string mode, float *bias,
float *bias1) {
if (nc < 4) { if (nc < 4) {
if (bias1 == nullptr) { if (bias1 == nullptr) {
for (int i = 0; i < mc; ++i) { for (int i = 0; i < mc; ++i) {
...@@ -2383,8 +2388,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, ...@@ -2383,8 +2388,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
} }
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
float *bias) { float *scale, float *bias) {
if (nc < 4) { if (nc < 4) {
for (int i = 0; i < mc; ++i) { for (int i = 0; i < mc; ++i) {
for (int j = 0; j < nc; ++j) { for (int j = 0; j < nc; ++j) {
...@@ -2484,8 +2489,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, ...@@ -2484,8 +2489,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
} }
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *bias) { float *scale, float *bias) {
if (nc < 4) { if (nc < 4) {
for (int i = 0; i < mc; ++i) { for (int i = 0; i < mc; ++i) {
for (int j = 0; j < nc; ++j) { for (int j = 0; j < nc; ++j) {
...@@ -2595,7 +2600,7 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, ...@@ -2595,7 +2600,7 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
} }
// C = A * B, batchnorm(C),C = C + bias; relu(C) // C = A * B, batchnorm(C),C = C + bias; relu(C)
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias) { float *new_scale, float *new_bias, float *bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -2649,7 +2654,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -2649,7 +2654,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
/* /*
// C = A * B // C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc) { void Gemm::VecWriteBasic(int n, float *c, float *C, int ldc) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
int nc2 = _nc1 / 4; int nc2 = _nc1 / 4;
...@@ -2695,10 +2700,10 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -2695,10 +2700,10 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
} }
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} void Gemm::VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
// C = A * B + C // C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc) { void Gemm::VecWriteWithAdd(int n, float *c, float *C, int ldc) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
...@@ -2736,7 +2741,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -2736,7 +2741,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { void Gemm::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
...@@ -2784,7 +2789,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -2784,7 +2789,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, void Gemm::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
float *bias) { float *bias) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
...@@ -2850,12 +2855,9 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -2850,12 +2855,9 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale, void Gemm::VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float
float *bias) { *scale, float *bias) { int nc1 = n / 16; int _nc1 = n % 16; int nc2 = _nc1 /
int nc1 = n / 16; 4; int nc3 = 16 - 4 * (_nc1 % 4);
int _nc1 = n % 16;
int nc2 = _nc1 / 4;
int nc3 = 16 - 4 * (_nc1 % 4);
asm volatile( asm volatile(
"vmov.f32 q14, #0.0 \n\t" "vmov.f32 q14, #0.0 \n\t"
...@@ -2926,7 +2928,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -2926,7 +2928,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
#endif // __aarch64__ #endif // __aarch64__
#else #else
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
float *c0, *c1, *c2, *c3; float *c0, *c1, *c2, *c3;
c0 = c; c0 = c;
c1 = c + ldc; c1 = c + ldc;
...@@ -2962,38 +2964,42 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -2962,38 +2964,42 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
} }
} }
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {} void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
}
void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {} void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {}
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {} void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {}
void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {} void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc,
float *bias) {}
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {} void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {}
void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
float *bias) {} float *bias) {}
void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc,
std::string mode, float *bias, float *bias1) {} float *p, std::string mode, float *bias,
float *bias1) {}
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
float *new_bias) {} float *new_scale, float *new_bias) {}
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias) {} float *new_scale, float *new_bias) {}
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1) {} float *new_scale, float *new_bias, float *bias1) {
}
#endif // __ARM_NEON #endif // __ARM_NEON
// 32位 float 矩阵乘法 // 32位 float 矩阵乘法
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, void Gemm::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, bool relu, const float *B, int ldb, float beta, float *C, int ldc,
float *bias) { bool relu, float *bias) {
// L1 data cache is 32 KiB (Per Cortex-A57, Cortex-A72, Cortex-A73) // L1 data cache is 32 KiB (Per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster) // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 32 * 1024; int L1 = 32 * 1024;
...@@ -3063,9 +3069,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3063,9 +3069,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, void Gemm::SgemmWithBn(int m, int n, int k, float alpha, const float *A,
const float *B, int ldb, float beta, float *C, int ldc, int lda, const float *B, int ldb, float beta, float *C,
bool relu, float *new_scale, float *new_bias, float *bias) { int ldc, bool relu, float *new_scale, float *new_bias,
float *bias) {
// L1 data cache is 32 KiB (Per Cortex-A57, Cortex-A72, Cortex-A73) // L1 data cache is 32 KiB (Per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster) // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 32 * 1024; int L1 = 32 * 1024;
...@@ -3136,7 +3143,7 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3136,7 +3143,7 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p, const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1) { std::string mode, float *bias, float *bias1) {
// L1 data cache is 32 KiB (Per Cortex-A57, Cortex-A72, Cortex-A73) // L1 data cache is 32 KiB (Per Cortex-A57, Cortex-A72, Cortex-A73)
...@@ -3212,7 +3219,7 @@ void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, ...@@ -3212,7 +3219,7 @@ void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
} }
// 32位 float 矩阵乘法 // 32位 float 矩阵乘法
void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias) { bool relu, float *bias) {
#ifdef _OPENMP #ifdef _OPENMP
...@@ -3237,18 +3244,18 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3237,18 +3244,18 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
NC = (n + NR - 1) / NR * NR; NC = (n + NR - 1) / NR * NR;
#if __aarch64__ #if __aarch64__
procPackA = PackMatrixA_6r; procPackA = &Gemm::PackMatrixA_6r;
procPackB = PackMatrixB_omp_16c; procPackB = &Gemm::PackMatrixB_omp_16c;
procAddDot = AddDot6x16; procAddDot = &Gemm::AddDot6x16;
#else #else
procPackA = PackMatrixA_6r; procPackA = &Gemm::PackMatrixA_6r;
procPackB = PackMatrixB_omp_8c; procPackB = &Gemm::PackMatrixB_omp_8c;
procAddDot = AddDot6x8; procAddDot = &Gemm::AddDot6x8;
#endif #endif
packedB = static_cast<float *>( packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
procPackB(KC, NC, NC % NR, B, ldb, packedB); (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB);
packedA = static_cast<float *>( packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
} else { } else {
...@@ -3265,18 +3272,19 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3265,18 +3272,19 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
MC = (m + MR - 1) / MR * MR; MC = (m + MR - 1) / MR * MR;
#if __aarch64__ #if __aarch64__
procPackA = PackMatrixA_omp_6r; procPackA = &Gemm::PackMatrixA_omp_6r;
procPackB = PackMatrixB_16c; procPackB = &Gemm::PackMatrixB_16c;
procAddDot = AddDot6x16; procAddDot = &Gemm::AddDot6x16;
#else #else
procPackA = PackMatrixA_omp_6r;
procPackB = PackMatrixB_8c; procPackA = &Gemm::PackMatrixA_omp_6r;
procAddDot = AddDot6x8; procPackB = &Gemm::PackMatrixB_8c;
procAddDot = &Gemm::AddDot6x8;
#endif #endif
packedA = static_cast<float *>( packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
procPackA(MC, KC, MC % MR, A, lda, packedA); (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA);
packedB = static_cast<float *>( packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
} }
...@@ -3298,7 +3306,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3298,7 +3306,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
mc = s_min(m - i, MC); mc = s_min(m - i, MC);
float *local_A = packedA + MC * KC * local_threads; float *local_A = packedA + MC * KC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A); (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C, InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
&C(i, 0), ldc, relu, bias + i); &C(i, 0), ldc, relu, bias + i);
} }
...@@ -3315,7 +3323,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3315,7 +3323,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
nc = s_min(n - j, NC); nc = s_min(n - j, NC);
float *local_B = packedB + KC * NC * local_threads; float *local_B = packedB + KC * NC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B); (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B);
InnerKernelWithBias(m, nc, alpha, packedA, local_B, beta, local_C, InnerKernelWithBias(m, nc, alpha, packedA, local_B, beta, local_C,
&C(0, j), ldc, relu, bias); &C(0, j), ldc, relu, bias);
} }
...@@ -3327,10 +3335,10 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3327,10 +3335,10 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
const float *B, int ldb, float beta, float *C, int ldc, int lda, const float *B, int ldb, float beta,
bool relu, float *new_scale, float *new_bias, float *C, int ldc, bool relu, float *new_scale,
float *bias) { float *new_bias, float *bias) {
#ifdef _OPENMP #ifdef _OPENMP
int max_threads = omp_get_max_threads(); int max_threads = omp_get_max_threads();
#else #else
...@@ -3353,18 +3361,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3353,18 +3361,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
NC = (n + NR - 1) / NR * NR; NC = (n + NR - 1) / NR * NR;
#if __aarch64__ #if __aarch64__
procPackA = PackMatrixA_6r; procPackA = &Gemm::PackMatrixA_6r;
procPackB = PackMatrixB_omp_16c; procPackB = &Gemm::PackMatrixB_omp_16c;
procAddDot = AddDot6x16; procAddDot = &Gemm::AddDot6x16;
#else #else
procPackA = PackMatrixA_6r; procPackA = &Gemm::PackMatrixA_6r;
procPackB = PackMatrixB_omp_8c; procPackB = &Gemm::PackMatrixB_omp_8c;
procAddDot = AddDot6x8; procAddDot = &Gemm::AddDot6x8;
#endif #endif
packedB = static_cast<float *>( packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
procPackB(KC, NC, NC % NR, B, ldb, packedB); (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB);
packedA = static_cast<float *>( packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
} else { } else {
...@@ -3381,18 +3389,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3381,18 +3389,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
MC = (m + MR - 1) / MR * MR; MC = (m + MR - 1) / MR * MR;
#if __aarch64__ #if __aarch64__
procPackA = PackMatrixA_omp_6r; procPackA = &Gemm::PackMatrixA_omp_6r;
procPackB = PackMatrixB_16c; procPackB = &Gemm::PackMatrixB_16c;
procAddDot = AddDot6x16; procAddDot = &Gemm::AddDot6x16;
#else #else
procPackA = PackMatrixA_omp_6r; procPackA = &Gemm::PackMatrixA_omp_6r;
procPackB = PackMatrixB_8c; procPackB = &Gemm::PackMatrixB_8c;
procAddDot = AddDot6x8; procAddDot = &Gemm::AddDot6x8;
#endif #endif
packedA = static_cast<float *>( packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
procPackA(MC, KC, MC % MR, A, lda, packedA); (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA);
packedB = static_cast<float *>( packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
} }
...@@ -3414,7 +3422,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3414,7 +3422,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
mc = s_min(m - i, MC); mc = s_min(m - i, MC);
float *local_A = packedA + MC * KC * local_threads; float *local_A = packedA + MC * KC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A); (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
if (bias == nullptr) { if (bias == nullptr) {
InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C, InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C,
&C(i, 0), ldc, relu, new_scale + i, new_bias + i); &C(i, 0), ldc, relu, new_scale + i, new_bias + i);
...@@ -3437,7 +3445,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3437,7 +3445,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
nc = s_min(n - j, NC); nc = s_min(n - j, NC);
float *local_B = packedB + KC * NC * local_threads; float *local_B = packedB + KC * NC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B); (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B);
if (bias == nullptr) { if (bias == nullptr) {
InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C, InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C,
&C(0, j), ldc, relu, new_scale, new_bias); &C(0, j), ldc, relu, new_scale, new_bias);
...@@ -3455,9 +3463,10 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3455,9 +3463,10 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p, const float *B, int ldb, float *C, int ldc,
std::string mode, float *bias, float *bias1) { float *p, std::string mode, float *bias,
float *bias1) {
#ifdef _OPENMP #ifdef _OPENMP
int max_threads = omp_get_max_threads(); int max_threads = omp_get_max_threads();
#else #else
...@@ -3480,18 +3489,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, ...@@ -3480,18 +3489,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
NC = (n + NR - 1) / NR * NR; NC = (n + NR - 1) / NR * NR;
#if __aarch64__ #if __aarch64__
procPackA = PackMatrixA_6r; procPackA = &Gemm::PackMatrixA_6r;
procPackB = PackMatrixB_omp_16c; procPackB = &Gemm::PackMatrixB_omp_16c;
procAddDot = AddDot6x16; procAddDot = &Gemm::AddDot6x16;
#else #else
procPackA = PackMatrixA_6r; procPackA = &Gemm::PackMatrixA_6r;
procPackB = PackMatrixB_omp_8c; procPackB = &Gemm::PackMatrixB_omp_8c;
procAddDot = AddDot6x8; procAddDot = &Gemm::AddDot6x8;
#endif #endif
packedB = static_cast<float *>( packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
procPackB(KC, NC, NC % NR, B, ldb, packedB); (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB);
packedA = static_cast<float *>( packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
} else { } else {
...@@ -3508,18 +3517,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, ...@@ -3508,18 +3517,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
MC = (m + MR - 1) / MR * MR; MC = (m + MR - 1) / MR * MR;
#if __aarch64__ #if __aarch64__
procPackA = PackMatrixA_omp_6r; procPackA = &Gemm::PackMatrixA_omp_6r;
procPackB = PackMatrixB_16c; procPackB = &Gemm::PackMatrixB_16c;
procAddDot = AddDot6x16; procAddDot = &Gemm::AddDot6x16;
#else #else
procPackA = PackMatrixA_omp_6r; procPackA = &Gemm::PackMatrixA_omp_6r;
procPackB = PackMatrixB_8c; procPackB = &Gemm::PackMatrixB_8c;
procAddDot = AddDot6x8; procAddDot = &Gemm::AddDot6x8;
#endif #endif
packedA = static_cast<float *>( packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
procPackA(MC, KC, MC % MR, A, lda, packedA); (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA);
packedB = static_cast<float *>( packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
} }
...@@ -3541,7 +3550,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, ...@@ -3541,7 +3550,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
mc = s_min(m - i, MC); mc = s_min(m - i, MC);
float *local_A = packedA + MC * KC * local_threads; float *local_A = packedA + MC * KC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A); (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
if (bias1 == nullptr) { if (bias1 == nullptr) {
InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc, InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc,
p + i, mode, bias + i, nullptr); p + i, mode, bias + i, nullptr);
...@@ -3563,7 +3572,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, ...@@ -3563,7 +3572,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
nc = s_min(n - j, NC); nc = s_min(n - j, NC);
float *local_B = packedB + KC * NC * local_threads; float *local_B = packedB + KC * NC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B); (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B);
if (bias1 == nullptr) { if (bias1 == nullptr) {
InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc, p, InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc, p,
mode, bias, nullptr); mode, bias, nullptr);
...@@ -3580,7 +3589,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, ...@@ -3580,7 +3589,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
#if __ARM_NEON #if __ARM_NEON
#if __aarch64__ #if __aarch64__
...@@ -3867,7 +3876,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -3867,7 +3876,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
} }
#if __aarch64__ #if __aarch64__
void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot8x12(int k, const float *a, const float *b, float *c,
int ldc) {
const float *a_ptr, *b_ptr; const float *a_ptr, *b_ptr;
a_ptr = a; a_ptr = a;
b_ptr = b; b_ptr = b;
...@@ -3956,7 +3966,8 @@ void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -3956,7 +3966,8 @@ void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28");
} }
void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot6x16(int k, const float *a, const float *b, float *c,
int ldc) {
const float *a_ptr, *b_ptr; const float *a_ptr, *b_ptr;
a_ptr = a; a_ptr = a;
b_ptr = b; b_ptr = b;
......
...@@ -35,7 +35,9 @@ namespace paddle_mobile { ...@@ -35,7 +35,9 @@ namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
/* class Gemm {
public:
/*
// 将 A 矩阵分块复制到连续内存(ColMajor) // 将 A 矩阵分块复制到连续内存(ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
...@@ -44,138 +46,156 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, ...@@ -44,138 +46,156 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
*/ */
typedef void (Gemm::*FnPack)(int, int, int, const float *, int, float *);
// 将 A 矩阵分块复制到连续内存(RowMajor) typedef void (Gemm::*FnAddDot)(int, const float *, const float *, float *,
void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, int);
FnPack procPackA;
FnPack procPackB;
FnAddDot procAddDot;
// 将 A 矩阵分块复制到连续内存(RowMajor)
void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
// 将 B 矩阵分块复制到连续内存(RowMajor) // 将 B 矩阵分块复制到连续内存(RowMajor)
void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
// 分块矩阵乘法 // 分块矩阵乘法
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
float beta, float *c, float *C, int ldc, bool relu); float beta, float *c, float *C, int ldc, bool relu);
void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *bias); int ldc, bool relu, float *bias);
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *new_scale, float *new_bias, int ldc, bool relu, float *new_scale, float *new_bias);
float *bias); void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *new_scale,
float *new_bias, float *bias);
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
float *c, float *C, int ldc, float *p, float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
/* /*
// 向量矩阵乘法 (M = 1) // 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu); bool relu);
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C, int lda, const float *B, int ldb, float beta, float
int ldc, bool relu, float *new_scale, float *new_bias); *C, int ldc, bool relu, float *new_scale, float *new_bias);
*/ */
// 计算一个更小的 C 矩阵分块 // 计算一个更小的 C 矩阵分块
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc); void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc); void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc); void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
// 分块矩阵乘法结果回写 // 分块矩阵乘法结果回写
// C = A * B // C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc); void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc); void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C // C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc); void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + bias // C = A * B + bias
void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias); void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias);
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc); void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C,prelu(C) // C = A * B + C,prelu(C)
void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
// C = A * B + bias ,relu(C) // C = A * B + bias ,relu(C)
void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
float *bias); float *bias);
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, void WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias); float *new_scale, float *new_bias);
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, // C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias);
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1); float *new_scale, float *new_bias, float *bias1);
/* /*
// 向量矩阵乘法结果回写 // 向量矩阵乘法结果回写
// C = A * B // C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc); void VecWriteBasic(int n, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc); void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
// C = A * B + C // C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc); void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc); void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale, void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias); float *new_bias);
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale, void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias); float *new_bias);
*/ */
// 32位 float 矩阵乘法 // 32位 float 矩阵乘法
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, bool relu, const float *B, int ldb, float beta, float *C, int ldc, bool relu,
float *bias); float *bias);
// 32位 float 矩阵乘法, 并对结果进行 batchnorm // 32位 float 矩阵乘法, 并对结果进行 batchnorm
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias, float *bias); bool relu, float *new_scale, float *new_bias, float *bias);
void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p, const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
// 32位 float 矩阵乘法(openmp 多线程版本) // 32位 float 矩阵乘法(openmp 多线程版本)
void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias); bool relu, float *bias);
// 32位 float 矩阵乘法, 并对结果进行 batchnorm(openmp 多线程版本) // 32位 float 矩阵乘法, 并对结果进行 batchnorm(openmp 多线程版本)
void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
const float *B, int ldb, float beta, float *C, int ldc, int lda, const float *B, int ldb, float beta, float *C,
bool relu, float *new_scale, float *new_bias, float *bias); int ldc, bool relu, float *new_scale, float *new_bias,
float *bias);
void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p, const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
private:
int MC = 0;
int KC = 0;
int NC = 0;
float *packedA;
float *packedB;
float *packedC;
float *zero;
};
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -28,19 +28,22 @@ struct GRUUnitFunctor<CPU, T> { ...@@ -28,19 +28,22 @@ struct GRUUnitFunctor<CPU, T> {
static void compute(GRUMetaValue<T> value, int frame_size, int batch_size, static void compute(GRUMetaValue<T> value, int frame_size, int batch_size,
const ActivationType active_node, const ActivationType active_node,
const ActivationType active_gate) { const ActivationType active_gate) {
Gemm gemm;
if (value.prev_out_value) { if (value.prev_out_value) {
Sgemm(batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1,
frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, value.prev_out_value, frame_size, value.gate_weight,
frame_size * 3, false, nullptr); frame_size * 2, 1, value.gate_value, frame_size * 3, false,
nullptr);
} }
forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size, forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size,
batch_size, active_gate); batch_size, active_gate);
if (value.prev_out_value) { if (value.prev_out_value) {
Sgemm(batch_size, frame_size, frame_size, 1, value.reset_output_value, gemm.Sgemm(batch_size, frame_size, frame_size, 1,
frame_size, value.state_weight, frame_size, 1, value.reset_output_value, frame_size, value.state_weight,
value.gate_value + frame_size * 2, frame_size * 3, false, nullptr); frame_size, 1, value.gate_value + frame_size * 2,
frame_size * 3, false, nullptr);
} }
forward_final_output(forward::gru_finalOutput<T>(), value, frame_size, forward_final_output(forward::gru_finalOutput<T>(), value, frame_size,
......
...@@ -36,6 +36,7 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -36,6 +36,7 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
Gemm gemm;
if (trans_a) { if (trans_a) {
int numel = matrix_a.numel(); int numel = matrix_a.numel();
...@@ -50,20 +51,24 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -50,20 +51,24 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
a[index++] = tmp[i * n + j]; a[index++] = tmp[i * n + j];
} }
} }
#ifdef _OPENMP #ifdef _OPENMP
Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
matrix_out->data<float>(), N, relu, bias); matrix_out->data<float>(), N, relu, bias);
#else #else
Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta, gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
matrix_out->data<float>(), N, relu, bias); matrix_out->data<float>(), N, relu, bias);
#endif #endif
} else { } else {
#ifdef _OPENMP #ifdef _OPENMP
Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K,
N, beta, matrix_out->data<float>(), N, relu, bias); matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
N, relu, bias);
#else #else
Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N, gemm.Sgemm(M, N, K, alpha, matrix_a.data<float>(), K,
beta, matrix_out->data<float>(), N, relu, bias); matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
relu, bias);
#endif #endif
} }
} }
...@@ -74,6 +79,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -74,6 +79,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
float alpha, framework::Tensor *matrix_out, float beta, float alpha, framework::Tensor *matrix_out, float beta,
bool relu, framework::Tensor *new_scale, bool relu, framework::Tensor *new_scale,
framework::Tensor *new_bias, int group, float *bias) { framework::Tensor *new_bias, int group, float *bias) {
Gemm gemm;
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -86,21 +92,22 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -86,21 +92,22 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
#ifdef _OPENMP #ifdef _OPENMP
SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K, gemm.SgemmWithBn_omp(
matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N, M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
relu, new_scale->data<float>() + group, beta, matrix_out->data<float>(), N, relu,
new_bias->data<float>() + group, bias); new_scale->data<float>() + group, new_bias->data<float>() + group, bias);
#else #else
SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), gemm.SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K,
N, beta, matrix_out->data<float>(), N, relu, matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
new_scale->data<float>() + group, new_bias->data<float>() + group, N, relu, new_scale->data<float>() + group,
bias); new_bias->data<float>() + group, bias);
#endif #endif
} }
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
framework::Tensor *matrix_out, float *p, std::string mode, framework::Tensor *matrix_out, float *p, std::string mode,
float *bias, float *bias1) { float *bias, float *bias1) {
Gemm gemm;
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -113,11 +120,13 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, ...@@ -113,11 +120,13 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
#ifdef _OPENMP #ifdef _OPENMP
SgemmWithPRelu_omp(M, N, K, matrix_a.data<float>(), K, matrix_b.data<float>(), gemm.SgemmWithPRelu_omp(M, N, K, matrix_a.data<float>(), K,
N, matrix_out->data<float>(), N, p, mode, bias, bias1); matrix_b.data<float>(), N, matrix_out->data<float>(),
N, p, mode, bias, bias1);
#else #else
SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K, matrix_b.data<float>(), N, gemm.SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K,
matrix_out->data<float>(), N, p, mode, bias, bias1); matrix_b.data<float>(), N, matrix_out->data<float>(), N,
p, mode, bias, bias1);
#endif #endif
} }
......
...@@ -35,8 +35,8 @@ if (CON GREATER -1) ...@@ -35,8 +35,8 @@ if (CON GREATER -1)
ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yolo paddle-mobile) target_link_libraries(test-yolo paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-yolo-combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test_yolo_combined paddle-mobile) target_link_libraries(test-yolo-combined paddle-mobile)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif () endif ()
...@@ -323,5 +323,10 @@ if (NOT FOUND_MATCH) ...@@ -323,5 +323,10 @@ if (NOT FOUND_MATCH)
target_link_libraries(test-fssd paddle-mobile) target_link_libraries(test-fssd paddle-mobile)
# gen test
ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h)
target_link_libraries(test-multi-process paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif () endif ()
...@@ -83,8 +83,9 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { ...@@ -83,8 +83,9 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
} }
} }
paddle_mobile::operators::math::SgemmWithBn( paddle_mobile::operators::math::Gemm gemm;
m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, nullptr); gemm.SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias,
nullptr);
int eq = 0; int eq = 0;
int neq = 0; int neq = 0;
for (int i = 0; i < m * n; ++i) { for (int i = 0; i < m * n; ++i) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <thread> // NOLINT
#include "../test_helper.h"
#include "../test_include.h"
void fun_yolo();
int fun_mobilenet();
// Entry point of the multi-instance inference test: runs the YOLO and the
// MobileNet workloads on two concurrent threads and waits for both to finish.
// To run them sequentially instead, call fun_yolo() / fun_mobilenet() directly.
int main() {
  // Extra instance that lives on the main thread for the whole run; it is
  // never used directly.
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile2;

  std::thread yolo_worker(fun_yolo);
  std::thread mobilenet_worker(fun_mobilenet);

  yolo_worker.join();
  mobilenet_worker.join();

  return 0;
}
// Thread-1 workload: loads the model at g_yolo into its own PaddleMobile
// instance (4 threads requested), prints the load time, then runs 10
// predictions on a 1x3x227x227 input and prints the average time per run.
// Nothing is printed if the model fails to load.
void fun_yolo() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(4);

  auto time1 = time();
  if (paddle_mobile.Load(g_yolo, true)) {
    auto time2 = time();
    // Measure from before Load() to after it. The previous code passed time1
    // twice, so the reported load cost was always 0.
    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;

    vector<int64_t> dims{1, 3, 227, 227};
    Tensor input_tensor;
    // NOTE(review): SetupTensor presumably fills the tensor with values in
    // the range given by the last two arguments (0..1) — confirm in
    // test_helper.h.
    SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
                       static_cast<float>(1));

    // Copy the tensor contents into a flat vector, the form Predict() takes.
    vector<float> input(input_tensor.data<float>(),
                        input_tensor.data<float>() + input_tensor.numel());

    auto time3 = time();
    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
    }
    auto time4 = time();
    std::cout << "thread 1: predict cost :" << time_diff(time3, time4) / 10
              << "ms" << std::endl;
  }
}
// Thread-2 workload: loads the model at g_mobilenet into its own PaddleMobile
// instance (4 threads requested), prints the load time, runs one prediction
// on the banana test image and prints the largest output value and its index,
// then does 10 warm-up runs followed by 10 timed runs and prints the average
// time per timed run. Always returns 0.
int fun_mobilenet() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(4);

  auto time1 = time();
  // Alternative kept for reference (separate model / params files):
  //   paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
  //                      std::string(g_mobilenet_detect) + "/params", true);
  auto isok = paddle_mobile.Load(g_mobilenet, true);
  if (isok) {
    auto time2 = time();
    // Measure from before Load() to after it. The previous code passed time1
    // twice, so the reported load cost was always 0.
    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;

    vector<float> input;
    vector<int64_t> dims{1, 3, 224, 224};
    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);

    auto vec_result = paddle_mobile.Predict(input, dims);
    auto biggest = max_element(begin(vec_result), end(vec_result));
    // max_element returns end() for an empty range; dereferencing that would
    // be undefined behavior, so only report when there is a result.
    if (biggest != end(vec_result)) {
      std::cout << " Max element is " << *biggest << " at position "
                << distance(begin(vec_result), biggest) << std::endl;
    }

    // Warm up with 10 runs before timing.
    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
    }

    auto time3 = time();
    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
    }
    auto time4 = time();

    // Logged after time4 is taken so that logging is outside the timed
    // window. This is the result of the first prediction above.
    DLOG << vec_result;
    std::cout << "thread 2: predict cost :" << time_diff(time3, time4) / 10
              << "ms" << std::endl;
  }

  // Printed whether or not the model loaded: a hint (in Chinese) to check
  // that the banana test image exists if the results are NaN.
  std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
               "是否存在?"
            << std::endl;
  return 0;
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册