diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
index da9e5b82ff82b102d92c4a2268597f3580ee9abb..c01a068fb9732b64da4097844736f7484fdfcab9 100644
--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
@@ -106,9 +106,9 @@ void ConvAddBasic(const FusionConvAddParam &param) {
       // gemm
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmulWithBias<float>(filter_slice, false, col_matrix, false,
-                                  static_cast<float>(1), &out_slice,
-                                  static_cast<float>(1), false, biase_data);
+      math::matmul<float>(filter_slice, false, col_matrix, false,
+                          static_cast<float>(1), &out_slice,
+                          static_cast<float>(1), false, biase_data);
     }
   }
 }
diff --git a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
index 1849b87ca076f79a2b2f14b3e387bd757f96a3f4..7845a2e98b6f34fd4bf3f5c8fcaa238d69b26c41 100644
--- a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
@@ -109,9 +109,9 @@ void ConvAddReluCompute(const FusionConvAddReluParam &param) {
       // gemm
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmulWithBias<float>(filter_slice, false, col_matrix, false,
-                                  static_cast<float>(1), &out_slice,
-                                  static_cast<float>(1), true, biase_data);
+      math::matmul<float>(filter_slice, false, col_matrix, false,
+                          static_cast<float>(1), &out_slice,
+                          static_cast<float>(1), true, biase_data);
     }
   }
 }
diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h
index 33caded3afaaf125bac9108f2fafeda3d3c2049f..41acb973409d9655ae47a8655c1cb527e9563775 100644
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -30,6 +30,7 @@ inline void ConvBasic(const ConvParam &param) {
   Tensor filter = *param.Filter();
   Tensor *output = param.Output();
   output->mutable_data<float>();
+  float *bias_data = output->mutable_data<float>();
   int groups = param.Groups();
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
@@ -106,7 +107,7 @@ inline void ConvBasic(const ConvParam &param) {
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
       math::matmul<float>(filter_slice, false, col_matrix, false,
                           static_cast<float>(1), &out_slice,
-                          static_cast<float>(0));
+                          static_cast<float>(0), false, bias_data);
     }
   }
 }
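Note: the three conv kernels above now funnel through a single math::matmul<float> that takes the bias pointer and the relu flag directly. For the intended numerics only -- a minimal standalone sketch, not paddle-mobile's packed NEON implementation -- the fused epilogue behaves like the reference below, assuming one bias value per output row (per output channel), which matches how biase_data is passed here:

#include <algorithm>

// Reference semantics: C = alpha * A * B + beta * C, then add a per-row
// bias and optionally apply ReLU. A is m x k, B is k x n, C is m x n,
// all row-major and densely stored.
void naive_sgemm_bias(int m, int n, int k, float alpha, const float *A,
                      const float *B, float beta, float *C, bool relu,
                      const float *bias) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) {
        acc += A[i * k + p] * B[p * n + j];
      }
      float v = alpha * acc + beta * C[i * n + j];
      if (bias != nullptr) {
        v += bias[i];  // per-row (per-output-channel) bias
      }
      C[i * n + j] = relu ? std::max(v, 0.f) : v;
    }
  }
}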
diff --git a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
index f23c86fef1819c66676c82feed71bafe0cb96e8e..7b5f18c15c82bada35c59117c106bc4d475bb72e 100644
--- a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
+++ b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
@@ -28,6 +28,7 @@ void FusionFcCompute(const FusionFcParam &param) {
   int axis = param.Axis();
   Tensor *out = param.Out();
   auto *out_data = out->mutable_data<float>();
+  float *bias_data = out->mutable_data<float>();
   const Tensor x_matrix =
       input_x->dims().size() > 2
           ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
@@ -56,7 +57,7 @@ void FusionFcCompute(const FusionFcParam &param) {
   //   DLOG << out_data[i];
   // }
   math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
-                      out, static_cast<float>(1));
+                      out, static_cast<float>(1), false, bias_data);
   PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
   //  if (out_dim.size() != 2) {
   //    out->Resize(out_dim);
diff --git a/src/operators/kernel/central-arm-func/mul_arm_func.h b/src/operators/kernel/central-arm-func/mul_arm_func.h
index d2da67afe1d2eb746971a2443bdb449eb2b66ec4..341759a96e1e7216fb9550596d3d3533dd0ab80a 100644
--- a/src/operators/kernel/central-arm-func/mul_arm_func.h
+++ b/src/operators/kernel/central-arm-func/mul_arm_func.h
@@ -59,6 +59,7 @@ void MulCompute(const MulParam &param) {
   const Tensor *input_y = param.InputY();
   Tensor *out = param.Out();
   out->mutable_data<float>();
+  float *bias_data = out->mutable_data<float>();
   const Tensor x_matrix =
       input_x->dims().size() > 2
           ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
@@ -72,7 +73,7 @@ void MulCompute(const MulParam &param) {
     out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
   }
   math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
-                      out, static_cast<float>(0));
+                      out, static_cast<float>(0), false, bias_data);
   if (out_dim.size() != 2) {
     out->Resize(out_dim);
   }
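Note: both FusionFcCompute and MulCompute flatten any input with more than two dimensions through framework::ReshapeToMatrix before calling matmul. A hedged sketch of that flattening rule (flatten_to_2d is a hypothetical standalone helper; the real function returns a Tensor viewing the same buffer):

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <utility>
#include <vector>

// Collapse dims [0, num_col_dims) into the row count and the remaining
// dims into the column count; the underlying data stays untouched.
std::pair<int64_t, int64_t> flatten_to_2d(const std::vector<int64_t> &dims,
                                          int num_col_dims) {
  assert(num_col_dims > 0 && num_col_dims < static_cast<int>(dims.size()));
  int64_t rows = std::accumulate(dims.begin(), dims.begin() + num_col_dims,
                                 int64_t{1}, std::multiplies<int64_t>());
  int64_t cols = std::accumulate(dims.begin() + num_col_dims, dims.end(),
                                 int64_t{1}, std::multiplies<int64_t>());
  return {rows, cols};
}

For example, a {8, 3, 32, 32} input with num_col_dims = 1 is treated as an 8 x 3072 matrix.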
diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp
index 580b8b77647ce5be5fa5ec094e571c34b32220ab..ef1625b72c54b168eb3b58a4126d2500fbfe561f 100644
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -2248,69 +2248,8 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
 
 // 32-bit float matrix multiplication
 void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-           const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
-  // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
-  // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
-  int L1 = 32 * 1024;
-  int L2 = 0.5 * 1024 * 1024;
-
-  KC = k;
-  MC = L1 / (KC * sizeof(float));
-  NC = L2 / (KC * sizeof(float));
-
-  // make sure MC is multiple of MR, and NC is multiple of NR
-  int mblock_num = (m + MC - 1) / MC;
-  MC = (m + mblock_num - 1) / mblock_num;
-  MC = (MC + MR - 1) / MR * MR;
-  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
-
-  int nblock_num = (n + NC - 1) / NC;
-  NC = (n + nblock_num - 1) / nblock_num;
-  NC = (NC + NR - 1) / NR * NR;
-  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
-
-  packedA = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
-  packedB = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
-  packedC = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
-  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
-
-  for (int l = 0; l < KC; ++l) {
-    zero[l] = 0;
-  }
-
-  int mc, nc;
-  for (int j = 0; j < n; j += NC) {
-    nc = s_min(n - j, NC);
-#if __aarch64__
-    // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
-    PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
-#else
-    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
-#endif
-    for (int i = 0; i < m; i += MC) {
-      mc = s_min(m - i, MC);
-#if __aarch64__
-      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
-      // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
-#else
-      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
-#endif
-      InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j),
-                  ldc, relu);
-    }
-  }
-
-  paddle_mobile::memory::Free(packedA);
-  paddle_mobile::memory::Free(packedB);
-  paddle_mobile::memory::Free(packedC);
-  paddle_mobile::memory::Free(zero);
-}
-void SgemmWithBias(int m, int n, int k, float alpha, const float *A, int lda,
-                   const float *B, int ldb, float beta, float *C, int ldc,
-                   bool relu, float *bias) {
+           const float *B, int ldb, float beta, float *C, int ldc, bool relu,
+           float *bias) {
   // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
   // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
   int L1 = 32 * 1024;
diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h
index aa0958c1bf96aaba52bb762efec8b0fd252d0db1..625fce0323580545c1655c1d3c325f995aa054f2 100644
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -128,10 +128,8 @@ void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
 
 // 32-bit float matrix multiplication
 void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-           const float *B, int ldb, float beta, float *C, int ldc, bool relu);
-void SgemmWithBias(int m, int n, int k, float alpha, const float *A, int lda,
-                   const float *B, int ldb, float beta, float *C, int ldc,
-                   bool relu, float *bias);
+           const float *B, int ldb, float beta, float *C, int ldc, bool relu,
+           float *bias);
 
 // 32-bit float matrix multiplication, with batchnorm applied to the result
 void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
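Note: the surviving Sgemm keeps the cache-aware blocking that both old variants shared: KC covers the whole k dimension, MC is sized so an MC x KC panel of A fits in L1, NC so a KC x NC panel of B fits in L2, and both are rounded up to multiples of the register-tile sizes MR and NR. A standalone sketch of that arithmetic (MR = 6 and NR = 8 are the 32-bit ARM tile sizes used elsewhere in gemm.cpp; treat them as assumptions here, and note it mirrors the original's assumption that k is small enough to leave MC and NC positive):

// Same block-size arithmetic as the Sgemm body above, pulled out for clarity.
void compute_block_sizes(int m, int n, int k, int *MC, int *NC, int *KC) {
  const int MR = 6, NR = 8;   // register tile sizes (assumed 32-bit ARM values)
  const int L1 = 32 * 1024;   // per-core L1 data cache
  const int L2 = 512 * 1024;  // conservative L2 size
  *KC = k;
  *MC = L1 / (*KC * static_cast<int>(sizeof(float)));  // A panel fits in L1
  *NC = L2 / (*KC * static_cast<int>(sizeof(float)));  // B panel fits in L2
  // Spread m evenly across blocks, then round MC up to a multiple of MR.
  int mblock_num = (m + *MC - 1) / *MC;
  *MC = (m + mblock_num - 1) / mblock_num;
  *MC = (*MC + MR - 1) / MR * MR;
  // Likewise for n and NR.
  int nblock_num = (n + *NC - 1) / *NC;
  *NC = (n + nblock_num - 1) / nblock_num;
  *NC = (*NC + NR - 1) / NR * NR;
}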
diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp
index e024609863ccb95506b4543732354e6424445205..9ac8d79e89b7a577f0a89807dc96c9f368fed6de 100644
--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
@@ -22,7 +22,8 @@ namespace math {
 template <>
 void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
                    const framework::Tensor &matrix_b, bool trans_b, float alpha,
-                   framework::Tensor *matrix_out, float beta, bool relu) {
+                   framework::Tensor *matrix_out, float beta, bool relu,
+                   float *bias) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
@@ -42,34 +43,7 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
   int K = (!trans_a) ? dim_a[1] : dim_a[0];
 
   Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
-        beta, matrix_out->data<float>(), N, relu);
-}
-template <>
-void matmulWithBias<float>(const framework::Tensor &matrix_a, bool trans_a,
-                           const framework::Tensor &matrix_b, bool trans_b,
-                           float alpha, framework::Tensor *matrix_out,
-                           float beta, bool relu, float *bias) {
-  auto dim_a = matrix_a.dims();
-  auto dim_b = matrix_b.dims();
-  auto dim_out = matrix_out->dims();
-  //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
-  //  dim_out.size() ==
-  //  2,
-  //                 "The input and output of matmul be matrix");
-  //
-  //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
-  //  platform::is_cpu_place(matrix_b.place())
-  //                     &&
-  //                     platform::is_cpu_place(matrix_out->place()),
-  //                 "Matrix must all be in CPUPlace");
-
-  int M = dim_out[0];
-  int N = dim_out[1];
-  int K = (!trans_a) ? dim_a[1] : dim_a[0];
-
-  SgemmWithBias(M, N, K, alpha, matrix_a.data<float>(), K,
-                matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
-                relu, bias);
+        beta, matrix_out->data<float>(), N, relu, bias);
 }
 
 template <>
diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h
index 8bb3c03f80bcdc33db56ce4b82395d841a211863..0f281e713426beeef94a86c9c82105879b07cc80 100644
--- a/src/operators/math/math_function.h
+++ b/src/operators/math/math_function.h
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
 
 #include <cmath>
 #include "framework/tensor.h"
@@ -21,16 +21,10 @@
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-// matrix multiply with continuous memory
 template <typename T>
 void matmul(const framework::Tensor &matrix_a, bool trans_a,
             const framework::Tensor &matrix_b, bool trans_b, T alpha,
-            framework::Tensor *matrix_out, T beta, bool relu = false);
-template <typename T>
-void matmulWithBias(const framework::Tensor &matrix_a, bool trans_a,
-                    const framework::Tensor &matrix_b, bool trans_b, T alpha,
-                    framework::Tensor *matrix_out, T beta, bool relu,
-                    float *bias);
+            framework::Tensor *matrix_out, T beta, bool relu, float *bias);
 
 template <typename T>
 void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
diff --git a/test/common/test_gemm_perf.cpp b/test/common/test_gemm_perf.cpp
index 260236e24ea44a6fc5708d4d0dac239252d28945..9b62ffe9e17790f068caaa94e9e82c53e5c2d9fb 100644
--- a/test/common/test_gemm_perf.cpp
+++ b/test/common/test_gemm_perf.cpp
@@ -49,9 +49,9 @@ int main() {
 
   auto time1 = time();
   for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul<float>(aa, false, bb, false,
-                                                  static_cast<float>(1), &cc,
-                                                  static_cast<float>(0), false);
+    paddle_mobile::operators::math::matmul<float>(
+        aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
+        false, ccptr);
 
     //    paddle_mobile::operators::math::matmulWithBn<float>(
     //        aa, false, bb, false, static_cast<float>(1), &cc,
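Note: with SgemmWithBias folded into Sgemm and matmulWithBias into matmul, migrating callers only append the two trailing arguments (relu, bias). A hedged usage sketch -- fc_forward is a hypothetical wrapper, the tensors are assumed to be 2-D floats already allocated, and bias is assumed to hold one float per output row:

#include "operators/math/math_function.h"

// out = x * w + bias, no activation; pass true for relu to fuse it in.
void fc_forward(const paddle_mobile::framework::Tensor &x,
                const paddle_mobile::framework::Tensor &w,
                paddle_mobile::framework::Tensor *out, float *bias) {
  paddle_mobile::operators::math::matmul<float>(
      x, false, w, false, static_cast<float>(1), out, static_cast<float>(0),
      false, bias);
}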