提交 0c80a592 编写于 作者: Y yangfei

imp fusion_conv_bn_add op in resnet

上级 bb6230ac
...@@ -108,7 +108,7 @@ void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) { ...@@ -108,7 +108,7 @@ void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) {
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step); Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBnAdd<float>(filter_slice, false, col_matrix, false, math::matmulWithBn<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(1), &out_slice,
static_cast<float>(1), true, &new_scale, static_cast<float>(1), true, &new_scale,
&new_bias, g, bias_data.data<float>()); &new_bias, g, bias_data.data<float>());
......
...@@ -2962,7 +2962,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -2962,7 +2962,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias) { bool relu, float *new_scale, float *new_bias, float *bias) {
// L1 data cache is 32 KiB (Per Cortex-A57, Cortex-A72, Cortex-A73) // L1 data cache is 32 KiB (Per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster) // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 32 * 1024; int L1 = 32 * 1024;
...@@ -3009,72 +3009,16 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3009,72 +3009,16 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
#else #else
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA); PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#endif #endif
if (bias == nullptr) {
InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC, InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
&C(i, j), ldc, relu, new_scale + i, new_bias + i); &C(i, j), ldc, relu, new_scale + i, new_bias + i);
} } else {
}
paddle_mobile::memory::Free(packedA);
paddle_mobile::memory::Free(packedB);
paddle_mobile::memory::Free(packedC);
paddle_mobile::memory::Free(zero);
}
void SgemmWithBnAdd(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias, float *bias) {
// L1 data cache is 32 KiB (Per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 32 * 1024;
int L2 = 512 * 1024;
KC = k;
MC = L1 / (KC * sizeof(float));
NC = L2 / (KC * sizeof(float));
// make sure MC is multiple of MR, and NC is multiple of NR
int mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR - 1) / MR * MR;
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
int nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR - 1) / NR * NR;
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
packedC = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
int mc, nc;
for (int j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
#if __aarch64__
// PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#else
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#endif
for (int i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
#if __aarch64__
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
// PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#else
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#endif
InnerKernelWithBnAdd(mc, nc, alpha, packedA, packedB, beta, packedC, InnerKernelWithBnAdd(mc, nc, alpha, packedA, packedB, beta, packedC,
&C(i, j), ldc, relu, new_scale + i, new_bias + i, &C(i, j), ldc, relu, new_scale + i, new_bias + i,
bias + i * ldc + j); bias + i * ldc + j);
} }
} }
}
paddle_mobile::memory::Free(packedA); paddle_mobile::memory::Free(packedA);
paddle_mobile::memory::Free(packedB); paddle_mobile::memory::Free(packedB);
...@@ -3260,114 +3204,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3260,114 +3204,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias) { bool relu, float *new_scale, float *new_bias,
#ifdef _OPENMP
int max_threads = omp_get_max_threads();
#else
int max_threads = 1;
#endif
int L1 = 64 / max_threads * 1024;
KC = k;
if (m > n) {
// 对 A 分块
MC = L1 / (KC * sizeof(float));
int mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR - 1) / MR * MR;
// 补齐 B
NC = (n + NR - 1) / NR * NR;
#if __aarch64__
procPackA = PackMatrixA_6r;
procPackB = PackMatrixB_omp_16c;
procAddDot = AddDot6x16;
#else
procPackA = PackMatrixA_6r;
procPackB = PackMatrixB_omp_8c;
procAddDot = AddDot6x8;
#endif
packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
procPackB(KC, NC, NC % NR, B, ldb, packedB);
packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
} else {
// 对 B 分块
NC = L1 / (KC * sizeof(float));
int nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR - 1) / NR * NR;
// 补齐 A
MC = (m + MR - 1) / MR * MR;
#if __aarch64__
procPackA = PackMatrixA_omp_6r;
procPackB = PackMatrixB_16c;
procAddDot = AddDot6x16;
#else
procPackA = PackMatrixA_omp_6r;
procPackB = PackMatrixB_8c;
procAddDot = AddDot6x8;
#endif
packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
procPackA(MC, KC, MC % MR, A, lda, packedA);
packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
}
zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
packedC = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));
if (m > n) {
#pragma omp parallel for
for (int i = 0; i < m; i += MC) {
#ifdef _OPENMP
int local_threads = omp_get_thread_num();
#else
int local_threads = 0;
#endif
int mc;
mc = s_min(m - i, MC);
float *local_A = packedA + MC * KC * local_threads;
float *local_C = packedC + MC * NC * local_threads;
procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C, &C(i, 0),
ldc, relu, new_scale + i, new_bias + i);
}
} else {
#pragma omp parallel for
for (int j = 0; j < n; j += NC) {
#ifdef _OPENMP
int local_threads = omp_get_thread_num();
#else
int local_threads = 0;
#endif
int nc;
nc = s_min(n - j, NC);
float *local_B = packedB + KC * NC * local_threads;
float *local_C = packedC + MC * NC * local_threads;
procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C, &C(0, j),
ldc, relu, new_scale, new_bias);
}
}
paddle_mobile::memory::Free(packedA);
paddle_mobile::memory::Free(packedB);
paddle_mobile::memory::Free(packedC);
paddle_mobile::memory::Free(zero);
}
void SgemmWithBnAdd_omp(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias,
float *bias) { float *bias) {
#ifdef _OPENMP #ifdef _OPENMP
int max_threads = omp_get_max_threads(); int max_threads = omp_get_max_threads();
...@@ -3445,10 +3282,15 @@ void SgemmWithBnAdd_omp(int m, int n, int k, float alpha, const float *A, ...@@ -3445,10 +3282,15 @@ void SgemmWithBnAdd_omp(int m, int n, int k, float alpha, const float *A,
float *local_A = packedA + MC * KC * local_threads; float *local_A = packedA + MC * KC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A); procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
if (bias == nullptr) {
InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C,
&C(i, 0), ldc, relu, new_scale + i, new_bias + i);
} else {
InnerKernelWithBnAdd(mc, n, alpha, local_A, packedB, beta, local_C, InnerKernelWithBnAdd(mc, n, alpha, local_A, packedB, beta, local_C,
&C(i, 0), ldc, relu, new_scale + i, new_bias + i, &C(i, 0), ldc, relu, new_scale + i, new_bias + i,
bias + i * ldc); bias + i * ldc);
} }
}
} else { } else {
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < n; j += NC) { for (int j = 0; j < n; j += NC) {
...@@ -3463,8 +3305,14 @@ void SgemmWithBnAdd_omp(int m, int n, int k, float alpha, const float *A, ...@@ -3463,8 +3305,14 @@ void SgemmWithBnAdd_omp(int m, int n, int k, float alpha, const float *A,
float *local_B = packedB + KC * NC * local_threads; float *local_B = packedB + KC * NC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B); procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
if (bias == nullptr) {
InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C,
&C(0, j), ldc, relu, new_scale, new_bias);
} else {
InnerKernelWithBnAdd(m, nc, alpha, packedA, local_B, beta, local_C, InnerKernelWithBnAdd(m, nc, alpha, packedA, local_B, beta, local_C,
&C(0, j), ldc, relu, new_scale, new_bias, bias + j); &C(0, j), ldc, relu, new_scale, new_bias,
bias + j);
}
} }
} }
......
...@@ -156,9 +156,6 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -156,9 +156,6 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
// 32位 float 矩阵乘法, 并对结果进行 batchnorm // 32位 float 矩阵乘法, 并对结果进行 batchnorm
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
void SgemmWithBnAdd(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias, float *bias); bool relu, float *new_scale, float *new_bias, float *bias);
void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
...@@ -173,12 +170,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -173,12 +170,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
// 32位 float 矩阵乘法, 并对结果进行 batchnorm(openmp 多线程版本) // 32位 float 矩阵乘法, 并对结果进行 batchnorm(openmp 多线程版本)
void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias); bool relu, float *new_scale, float *new_bias, float *bias);
// 32位 float 矩阵乘法, 并对结果进行 batchnorm和add(openmp 多线程版本)
void SgemmWithBnAdd_omp(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias,
float *bias);
void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p, const float *B, int ldb, float *C, int ldc, float *p,
......
...@@ -56,7 +56,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -56,7 +56,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out, float beta, float alpha, framework::Tensor *matrix_out, float beta,
bool relu, framework::Tensor *new_scale, bool relu, framework::Tensor *new_scale,
framework::Tensor *new_bias, int group) { framework::Tensor *new_bias, int group, float *bias) {
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -79,49 +79,12 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -79,49 +79,12 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K, SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N, matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
relu, new_scale->data<float>() + group, relu, new_scale->data<float>() + group,
new_bias->data<float>() + group); new_bias->data<float>() + group, bias);
#else #else
SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
N, beta, matrix_out->data<float>(), N, relu, N, beta, matrix_out->data<float>(), N, relu,
new_scale->data<float>() + group, new_scale->data<float>() + group, new_bias->data<float>() + group,
new_bias->data<float>() + group); bias);
#endif
}
template <>
void matmulWithBnAdd<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out,
float beta, bool relu, framework::Tensor *new_scale,
framework::Tensor *new_bias, int group,
float *bias) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
// PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
// dim_out.size() ==
// 2,
// "The input and output of matmul be matrix");
//
// PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
// platform::is_cpu_place(matrix_b.place())
// &&
// platform::is_cpu_place(matrix_out->place()),
// "Matrix must all be in CPUPlace");
int M = dim_out[0];
int N = dim_out[1];
int K = (!trans_a) ? dim_a[1] : dim_a[0];
#ifdef _OPENMP
SgemmWithBnAdd_omp(M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
N, relu, new_scale->data<float>() + group,
new_bias->data<float>() + group, bias);
#else
SgemmWithBnAdd(M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
relu, new_scale->data<float>() + group,
new_bias->data<float>() + group, bias);
#endif #endif
} }
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
......
...@@ -32,13 +32,7 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, ...@@ -32,13 +32,7 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu, framework::Tensor *matrix_out, T beta, bool relu,
framework::Tensor *new_scale, framework::Tensor *new_bias, framework::Tensor *new_scale, framework::Tensor *new_bias,
int group); int group, float *bias = nullptr);
template <typename T>
void matmulWithBnAdd(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out, float beta,
bool relu, framework::Tensor *new_scale,
framework::Tensor *new_bias, int group, float *bias);
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
......
...@@ -83,8 +83,8 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { ...@@ -83,8 +83,8 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
} }
} }
paddle_mobile::operators::math::SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, paddle_mobile::operators::math::SgemmWithBn(
c, ldc, relu, scale, bias); m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, nullptr);
int eq = 0; int eq = 0;
int neq = 0; int neq = 0;
for (int i = 0; i < m * n; ++i) { for (int i = 0; i < m * n; ++i) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册