diff --git a/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
index 8658b058f08313acfb29c34d8d40c75edcb858f2..7c31eed19693d20084e25daa485a0553d5d795f2 100644
--- a/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
@@ -108,10 +108,10 @@ void ConvBNAddReluBasic(const FusionConvBNAddReluParam &param) {
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
       Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step);
-      math::matmulWithBnAdd(filter_slice, false, col_matrix, false,
-                            static_cast<float>(1), &out_slice,
-                            static_cast<float>(1), true, &new_scale,
-                            &new_bias, g, bias_data.data<float>());
+      math::matmulWithBn(filter_slice, false, col_matrix, false,
+                         static_cast<float>(1), &out_slice,
+                         static_cast<float>(1), true, &new_scale,
+                         &new_bias, g, bias_data.data<float>());
     }
   }
 }
diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp
index cd03b5e263b72a86a99570cb5e4dd0ad6d717afb..9f0a18f04f9f247cc06ccf73a36b574cb19d92ad 100644
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -2962,7 +2962,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
 
 void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
                  const float *B, int ldb, float beta, float *C, int ldc,
-                 bool relu, float *new_scale, float *new_bias) {
+                 bool relu, float *new_scale, float *new_bias, float *bias) {
   // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
   // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
   int L1 = 32 * 1024;
@@ -3009,70 +3009,14 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
 #else
       PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
 #endif
-      InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
-                        &C(i, j), ldc, relu, new_scale + i, new_bias + i);
-    }
-  }
-
-  paddle_mobile::memory::Free(packedA);
-  paddle_mobile::memory::Free(packedB);
-  paddle_mobile::memory::Free(packedC);
-  paddle_mobile::memory::Free(zero);
-}
-
-void SgemmWithBnAdd(int m, int n, int k, float alpha, const float *A, int lda,
-                    const float *B, int ldb, float beta, float *C, int ldc,
-                    bool relu, float *new_scale, float *new_bias, float *bias) {
-  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
-  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
-  int L1 = 32 * 1024;
-  int L2 = 512 * 1024;
-
-  KC = k;
-  MC = L1 / (KC * sizeof(float));
-  NC = L2 / (KC * sizeof(float));
-
-  // make sure MC is multiple of MR, and NC is multiple of NR
-  int mblock_num = (m + MC - 1) / MC;
-  MC = (m + mblock_num - 1) / mblock_num;
-  MC = (MC + MR - 1) / MR * MR;
-  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
-
-  int nblock_num = (n + NC - 1) / NC;
-  NC = (n + nblock_num - 1) / nblock_num;
-  NC = (NC + NR - 1) / NR * NR;
-  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
-
-  packedA = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
-  packedB = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
-  packedC = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
-  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
-  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
-
-  int mc, nc;
-  for (int j = 0; j < n; j += NC) {
-    nc = s_min(n - j, NC);
-#if __aarch64__
-    // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
-    PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
-#else
-    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
-#endif
-    for (int i = 0; i < m; i += MC) {
-      mc = s_min(m - i, MC);
-#if __aarch64__
-      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
-      // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
-#else
-      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
-#endif
-
-      InnerKernelWithBnAdd(mc, nc, alpha, packedA, packedB, beta, packedC,
-                           &C(i, j), ldc, relu, new_scale + i, new_bias + i,
-                           bias + i * ldc + j);
+      if (bias == nullptr) {
+        InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
+                          &C(i, j), ldc, relu, new_scale + i, new_bias + i);
+      } else {
+        InnerKernelWithBnAdd(mc, nc, alpha, packedA, packedB, beta, packedC,
+                             &C(i, j), ldc, relu, new_scale + i, new_bias + i,
+                             bias + i * ldc + j);
+      }
     }
   }
 
@@ -3260,115 +3204,8 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
 
 void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
                      const float *B, int ldb, float beta, float *C, int ldc,
-                     bool relu, float *new_scale, float *new_bias) {
-#ifdef _OPENMP
-  int max_threads = omp_get_max_threads();
-#else
-  int max_threads = 1;
-#endif
-
-  int L1 = 64 / max_threads * 1024;
-  KC = k;
-  if (m > n) {
-    // 对 A 分块
-    MC = L1 / (KC * sizeof(float));
-    int mblock_num = (m + MC - 1) / MC;
-    MC = (m + mblock_num - 1) / mblock_num;
-    MC = (MC + MR - 1) / MR * MR;
-    // 补齐 B
-    NC = (n + NR - 1) / NR * NR;
-
-#if __aarch64__
-    procPackA = PackMatrixA_6r;
-    procPackB = PackMatrixB_omp_16c;
-    procAddDot = AddDot6x16;
-#else
-    procPackA = PackMatrixA_6r;
-    procPackB = PackMatrixB_omp_8c;
-    procAddDot = AddDot6x8;
-#endif
-
-    packedB = static_cast<float *>(
-        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
-    procPackB(KC, NC, NC % NR, B, ldb, packedB);
-    packedA = static_cast<float *>(
-        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
-  } else {
-    // 对 B 分块
-    NC = L1 / (KC * sizeof(float));
-    int nblock_num = (n + NC - 1) / NC;
-    NC = (n + nblock_num - 1) / nblock_num;
-    NC = (NC + NR - 1) / NR * NR;
-    // 补齐 A
-    MC = (m + MR - 1) / MR * MR;
-
-#if __aarch64__
-    procPackA = PackMatrixA_omp_6r;
-    procPackB = PackMatrixB_16c;
-    procAddDot = AddDot6x16;
-#else
-    procPackA = PackMatrixA_omp_6r;
-    procPackB = PackMatrixB_8c;
-    procAddDot = AddDot6x8;
-#endif
-
-    packedA = static_cast<float *>(
-        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
-    procPackA(MC, KC, MC % MR, A, lda, packedA);
-    packedB = static_cast<float *>(
-        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
-  }
-  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
-  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
-  packedC = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));
-
-  if (m > n) {
-#pragma omp parallel for
-    for (int i = 0; i < m; i += MC) {
-#ifdef _OPENMP
-      int local_threads = omp_get_thread_num();
-#else
-      int local_threads = 0;
-#endif
-
-      int mc;
-      mc = s_min(m - i, MC);
-      float *local_A = packedA + MC * KC * local_threads;
-      float *local_C = packedC + MC * NC * local_threads;
-      procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
-      InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C, &C(i, 0),
-                        ldc, relu, new_scale + i, new_bias + i);
-    }
-  } else {
-#pragma omp parallel for
-    for (int j = 0; j < n; j += NC) {
-#ifdef _OPENMP
-      int local_threads = omp_get_thread_num();
-#else
-      int local_threads = 0;
-#endif
-
-      int nc;
-      nc = s_min(n - j, NC);
-      float *local_B = packedB + KC * NC * local_threads;
-      float *local_C = packedC + MC * NC * local_threads;
-      procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
-      InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C, &C(0, j),
-                        ldc, relu, new_scale, new_bias);
-    }
-  }
-
-  paddle_mobile::memory::Free(packedA);
-  paddle_mobile::memory::Free(packedB);
-  paddle_mobile::memory::Free(packedC);
-  paddle_mobile::memory::Free(zero);
-}
-
-void SgemmWithBnAdd_omp(int m, int n, int k, float alpha, const float *A,
-                        int lda, const float *B, int ldb, float beta, float *C,
-                        int ldc, bool relu, float *new_scale, float *new_bias,
-                        float *bias) {
+                     bool relu, float *new_scale, float *new_bias,
+                     float *bias) {
 #ifdef _OPENMP
   int max_threads = omp_get_max_threads();
 #else
@@ -3445,9 +3282,14 @@ void SgemmWithBnAdd_omp(int m, int n, int k, float alpha, const float *A,
       float *local_A = packedA + MC * KC * local_threads;
       float *local_C = packedC + MC * NC * local_threads;
       procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
-      InnerKernelWithBnAdd(mc, n, alpha, local_A, packedB, beta, local_C,
-                           &C(i, 0), ldc, relu, new_scale + i, new_bias + i,
-                           bias + i * ldc);
+      if (bias == nullptr) {
+        InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C,
+                          &C(i, 0), ldc, relu, new_scale + i, new_bias + i);
+      } else {
+        InnerKernelWithBnAdd(mc, n, alpha, local_A, packedB, beta, local_C,
+                             &C(i, 0), ldc, relu, new_scale + i, new_bias + i,
+                             bias + i * ldc);
+      }
     }
   } else {
 #pragma omp parallel for
@@ -3463,8 +3305,14 @@ void SgemmWithBnAdd_omp(int m, int n, int k, float alpha, const float *A,
       float *local_B = packedB + KC * NC * local_threads;
       float *local_C = packedC + MC * NC * local_threads;
       procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
-      InnerKernelWithBnAdd(m, nc, alpha, packedA, local_B, beta, local_C,
-                           &C(0, j), ldc, relu, new_scale, new_bias, bias + j);
+      if (bias == nullptr) {
+        InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C,
+                          &C(0, j), ldc, relu, new_scale, new_bias);
+      } else {
+        InnerKernelWithBnAdd(m, nc, alpha, packedA, local_B, beta, local_C,
+                             &C(0, j), ldc, relu, new_scale, new_bias,
+                             bias + j);
+      }
     }
   }
 
diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h
index 2378d59c55217fc0ecf07a99626984aa21533014..abd209bb45c650363b7d19c495bea4d9848fc834 100644
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -157,10 +157,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
 // 32位 float 矩阵乘法, 并对结果进行 batchnrom
 void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
                  const float *B, int ldb, float beta, float *C, int ldc,
-                 bool relu, float *new_scale, float *new_bias);
-void SgemmWithBnAdd(int m, int n, int k, float alpha, const float *A, int lda,
-                    const float *B, int ldb, float beta, float *C, int ldc,
-                    bool relu, float *new_scale, float *new_bias, float *bias);
+                 bool relu, float *new_scale, float *new_bias, float *bias);
 void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
                     const float *B, int ldb, float *C, int ldc, float *p,
                     std::string mode, float *bias, float *bias1);
@@ -173,12 +170,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
 // 32位 float 矩阵乘法, 并对结果进行 batchnrom(openmp 多线程版本)
 void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
                      const float *B, int ldb, float beta, float *C, int ldc,
-                     bool relu, float *new_scale, float *new_bias);
-// 32位 float 矩阵乘法, 并对结果进行 batchnorm和add(openmp 多线程版本)
-void SgemmWithBnAdd_omp(int m, int n, int k, float alpha, const float *A,
-                        int lda, const float *B, int ldb, float beta, float *C,
-                        int ldc, bool relu, float *new_scale, float *new_bias,
-                        float *bias);
+                     bool relu, float *new_scale, float *new_bias, float *bias);
 void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
                         const float *B, int ldb, float *C, int ldc, float *p,
                         std::string mode, float *bias, float *bias1);
diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp
index dca8be83ef1277e1d74f77bf7cec1c03d35b00d4..576b06422cd0665d9e211633ce2f559e73c11fb5 100644
--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
@@ -56,7 +56,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
                          const framework::Tensor &matrix_b, bool trans_b,
                          float alpha, framework::Tensor *matrix_out, float beta,
                          bool relu, framework::Tensor *new_scale,
-                         framework::Tensor *new_bias, int group) {
+                         framework::Tensor *new_bias, int group, float *bias) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
@@ -79,49 +79,12 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
   SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K,
                   matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
                   N, relu, new_scale->data<float>() + group,
-                  new_bias->data<float>() + group);
+                  new_bias->data<float>() + group, bias);
 #else
   SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K,
               matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N, relu,
-              new_scale->data<float>() + group,
-              new_bias->data<float>() + group);
-#endif
-}
-template <>
-void matmulWithBnAdd<float>(const framework::Tensor &matrix_a, bool trans_a,
-                            const framework::Tensor &matrix_b, bool trans_b,
-                            float alpha, framework::Tensor *matrix_out,
-                            float beta, bool relu, framework::Tensor *new_scale,
-                            framework::Tensor *new_bias, int group,
-                            float *bias) {
-  auto dim_a = matrix_a.dims();
-  auto dim_b = matrix_b.dims();
-  auto dim_out = matrix_out->dims();
-  //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
-  //  dim_out.size() ==
-  //  2,
-  //                 "The input and output of matmul be matrix");
-  //
-  //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
-  //  platform::is_cpu_place(matrix_b.place())
-  //                     &&
-  //                     platform::is_cpu_place(matrix_out->place()),
-  //                 "Matrix must all be in CPUPlace");
-
-  int M = dim_out[0];
-  int N = dim_out[1];
-  int K = (!trans_a) ? dim_a[1] : dim_a[0];
-
-#ifdef _OPENMP
-  SgemmWithBnAdd_omp(M, N, K, alpha, matrix_a.data<float>(), K,
-                     matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
-                     N, relu, new_scale->data<float>() + group,
-                     new_bias->data<float>() + group, bias);
-#else
-  SgemmWithBnAdd(M, N, K, alpha, matrix_a.data<float>(), K,
-                 matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
-                 relu, new_scale->data<float>() + group,
-                 new_bias->data<float>() + group, bias);
+              new_scale->data<float>() + group, new_bias->data<float>() + group,
+              bias);
 #endif
 }
 void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h
index 9b51743999e40c999eddef87fc571cedc7f3171a..8d97f8628fb4f71cdd7664161983225136ec7c7f 100644
--- a/src/operators/math/math_function.h
+++ b/src/operators/math/math_function.h
@@ -32,13 +32,7 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
                   const framework::Tensor &matrix_b, bool trans_b, T alpha,
                   framework::Tensor *matrix_out, T beta, bool relu,
                   framework::Tensor *new_scale, framework::Tensor *new_bias,
-                  int group);
-template <typename T>
-void matmulWithBnAdd(const framework::Tensor &matrix_a, bool trans_a,
-                     const framework::Tensor &matrix_b, bool trans_b,
-                     float alpha, framework::Tensor *matrix_out, float beta,
-                     bool relu, framework::Tensor *new_scale,
-                     framework::Tensor *new_bias, int group, float *bias);
+                  int group, float *bias = nullptr);
 void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
                      const framework::Tensor &matrix_b, bool trans_b,
diff --git a/test/common/test_gemm_accuracy.cpp b/test/common/test_gemm_accuracy.cpp
index 35241fbd535e062be1c7f1f28eb3860d118a3455..3e31a5f2fe9b41f90f9aebfe44db908682f83ce1 100644
--- a/test/common/test_gemm_accuracy.cpp
+++ b/test/common/test_gemm_accuracy.cpp
@@ -83,8 +83,8 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
     }
   }
 
-  paddle_mobile::operators::math::SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3,
-                                              c, ldc, relu, scale, bias);
+  paddle_mobile::operators::math::SgemmWithBn(
+      m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, nullptr);
   int eq = 0;
   int neq = 0;
   for (int i = 0; i < m * n; ++i) {
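
Not part of the patch — a minimal reference sketch of what the unified SgemmWithBn path computes after this change, useful for reading the dispatch above: when bias == nullptr it applies only the folded batchnorm epilogue (InnerKernelWithBn), otherwise it also adds the extra elementwise input (InnerKernelWithBnAdd), which is indexed as bias + i * ldc + j. The function and its name below are illustrative only, not an API in the repo; it assumes row-major storage, per-output-row (per-channel) new_scale/new_bias, and ignores alpha/beta (the fused conv kernel above passes 1 for both).

#include <algorithm>

// Naive oracle for the fused GEMM + batchnorm (+ elementwise add) + optional
// ReLU that the merged SgemmWithBn selects at runtime. Hypothetical helper,
// intended only as a cross-check alongside test_gemm_accuracy.cpp.
void ReferenceGemmBnAddRelu(int m, int n, int k, const float *A, int lda,
                            const float *B, int ldb, float *C, int ldc,
                            bool relu, const float *new_scale,
                            const float *new_bias, const float *add) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) {
        acc += A[i * lda + p] * B[p * ldb + j];  // plain row-major GEMM
      }
      // Folded batchnorm: one scale/bias pair per output row (channel),
      // matching how the kernels index new_scale + i / new_bias + i.
      float v = new_scale[i] * acc + new_bias[i];
      if (add != nullptr) {
        v += add[i * ldc + j];  // the "Add" branch (bias + i * ldc + j above)
      }
      C[i * ldc + j] = relu ? std::max(v, 0.f) : v;  // optional ReLU
    }
  }
}

Keeping a single entry point with a defaulted float *bias = nullptr in math_function.h means existing matmulWithBn callers compile unchanged, while the conv+bn+add+relu fusion simply passes the elementwise addend; test_gemm_accuracy.cpp passes nullptr explicitly because the Sgemm-level functions take the extra argument without a default.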