diff --git a/.gitignore b/.gitignore index a6225fe41666b36a806360694b51984bdaf788f3..fb2c622c2d51a8a3f4a3bfb9a8088eebb5d2e134 100644 --- a/.gitignore +++ b/.gitignore @@ -57,3 +57,13 @@ cmake-build-debug/ cmake-build-release/ test/models/ + +# Emacs intermediate files +*~ + +# CMake building directory +build + +# clion building directories +cmake-build-debug +cmake-build-release diff --git a/src/operators/math/Gemm.cpp b/src/operators/math/Gemm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a720d1caecfab2902fba3362083e214b4c9f0f33 --- /dev/null +++ b/src/operators/math/Gemm.cpp @@ -0,0 +1,196 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +==============================================================================*/ + +#include "operators/math/Gemm.h" +#include +#include + +namespace paddle_mobile { +namespace operators { +namespace math { +// 将A矩阵分块复制到连续内存 +void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, + float *buffer) { + int i, j; + const float *Aij; + for (i = 0; i < m - paddingM; i += MR) { + for (int j = 0; j < k; ++j) { + Aij = &A(i, j); + *buffer++ = *Aij; + *buffer++ = *(Aij + 1); + *buffer++ = *(Aij + 2); + *buffer++ = *(Aij + 3); + } + } + if (paddingM != 0) { + for (j = 0; j < k; ++j) { + Aij = &A(m - paddingM, j); + for (i = 0; i < paddingM; ++i) { + *buffer++ = *(Aij + i); + } + for (i = paddingM; i < MR; ++i) { + *buffer++ = 0; + } + } + } +} + +// 将B矩阵分块复制到连续内存 +void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, + float *buffer) { + int i, j; + for (j = 0; j < n - paddingN; j += NR) { + const float *Bj = &B(0, j); + const float *Bj1 = &B(0, j + 1); + const float *Bj2 = &B(0, j + 2); + const float *Bj3 = &B(0, j + 3); + for (i = 0; i < k; ++i) { + *buffer++ = *Bj++; + *buffer++ = *Bj1++; + *buffer++ = *Bj2++; + *buffer++ = *Bj3++; + } + } + if (paddingN != 0) { + for (i = 0; i < k; ++i) { + for (int j = n - paddingN; j < n; ++j) { + const float *Bij = &B(i, j); + *buffer++ = *Bij++; + } + for (int j = n; j < n + (NR - paddingN); ++j) { + *buffer++ = 0; + } + } + } +} + +// 分块矩阵乘法 +void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B, + int ldb, float *C, int ldc, int first_time) { + int Buff_A_M = m; + int Buff_B_N = n; + + int _mc = m % MR; + int _nc = n % NR; + + if (_mc != 0) { + Buff_A_M = m + (MR - _mc); + } + + if (_nc != 0) { + Buff_B_N = n + (NR - _nc); + } + + float packedA[MC * KC]; + static float packedB[KC * NC]; + + if (first_time) { + PackMatrixB(k, n, _nc, B, ldb, packedB); + } + PackMatrixA(m, k, _mc, A, lda, packedA); + + int i, j, mc, nc; + + // B 取 4 列, 打包预热 + for (j = 0; j < Buff_B_N; j += NR) { + nc = (n - j) < NR ? _nc : NR; + // A 取 4 行,打包预热 + for (i = 0; i < Buff_A_M; i += MR) { + mc = (m - i) < MR ? _mc : MR; + AddDot4x4(k, &packedA[i * k], 4, &packedB[j * k], k, &C(i, j), ldc, mc, + nc); + } + } +} + +// 计算一个更小的 4 * 4 的 C 矩阵分块 +void AddDot4x4(int k, const float *a, int lda, const float *b, int ldb, + float *C, int ldc, int mc, int nc) { + float c[16] = {0}; + float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3; + + // // init C + // float32x4_t cv0 = vdup_n_f32(0.0); + // float32x4_t cv1 = vdup_n_f32(0.0); + // float32x4_t cv2 = vdup_n_f32(0.0); + // float32x4_t cv3 = vdup_n_f32(0.0); + + for (int p = 0; p < k; p += 1) { + reg_b0 = *b++; + reg_b1 = *b++; + reg_b2 = *b++; + reg_b3 = *b++; + + reg_a0 = *a++; + reg_a1 = *a++; + reg_a2 = *a++; + reg_a3 = *a++; + + // first row + c[0] += reg_a0 * reg_b0; + c[1] += reg_a0 * reg_b1; + c[2] += reg_a0 * reg_b2; + c[3] += reg_a0 * reg_b3; + + // second row + c[4] += reg_a1 * reg_b0; + c[5] += reg_a1 * reg_b1; + c[6] += reg_a1 * reg_b2; + c[7] += reg_a1 * reg_b3; + + // third row + c[8] += reg_a2 * reg_b0; + c[9] += reg_a2 * reg_b1; + c[10] += reg_a2 * reg_b2; + c[11] += reg_a2 * reg_b3; + + // fourth row + c[12] += reg_a3 * reg_b0; + c[13] += reg_a3 * reg_b1; + c[14] += reg_a3 * reg_b2; + c[15] += reg_a3 * reg_b3; + } + int i, j; + for (i = 0; i < mc; ++i) { + for (j = 0; j < nc; ++j) { + C(i, j) += c[i * 4 + j]; + } + } +} + +// 32位 float 矩阵乘法 +void sgemm(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc) { + int i, j, p, mc, nc, kc; + + for (j = 0; j < n; j += NC) { + nc = min(n - j, NC); + for (p = 0; p < k; p += KC) { + kc = min(k - p, KC); + for (i = 0; i < m; i += MC) { + mc = min(m - i, MC); + InnerKernel(mc, nc, kc, &A(i, p), lda, &B(p, j), ldb, &C(i, j), ldc, + i == 0); + } + } + } +} + +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/math/Gemm.h b/src/operators/math/Gemm.h new file mode 100644 index 0000000000000000000000000000000000000000..274719fdb2881a54e80501315bee7127146d5b5a --- /dev/null +++ b/src/operators/math/Gemm.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +==============================================================================*/ + +#pragma once + +// 矩阵取值运算宏,假设矩阵按列存储 +#define A(i, j) A[(j)*lda + (i)] +#define B(i, j) B[(j)*ldb + (i)] +#define C(i, j) C[(j)*ldc + (i)] + +// 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k +#define MC 384 +#define KC 384 +#define NC 4096 +#define MR 4 +#define NR 4 + +#define min(i, j) ((i) < (j) ? (i) : (j)) + +namespace paddle_mobile { +namespace operators { +namespace math { + +// 将 A 矩阵分块复制到连续内存 +void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, + float *buffer); + +// 将 B 矩阵分块复制到连续内存 +void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, + float *buffer); + +// 分块矩阵乘法 +void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B, + int ldb, float *C, int ldc, int first_time); + +// 计算一个更小的 4 * 4 的 C 矩阵分块 +void AddDot4x4(int k, const float *A, int lda, const float *B, int ldb, + float *C, int ldc, int mc, int nc); + +// 32位 float 矩阵乘法 +void sgemm(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc); + +// 64位 double 矩阵乘法 +void dgemm(int m, int n, int k, float alpha, const double *A, int lda, + const double *B, int ldb, float beta, double *C, int ldc); + +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 63e0dd819345456f04522c1e5b5c62d838b2950c..2c2abef0b996555056571922c30c76c80494451c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -47,4 +47,8 @@ target_link_libraries(test-optimize paddle-mobile) #gen test ADD_EXECUTABLE(test-pool operators/test_pool_op.cpp test_helper.h test_include.h framework/executor_for_test.h framework/executor_for_test.cpp) -target_link_libraries(test-pool paddle-mobile) \ No newline at end of file +target_link_libraries(test-pool paddle-mobile) + +# gen test +ADD_EXECUTABLE(test-gemm common/test_gemm.cpp) +target_link_libraries(test-gemm paddle-mobile) \ No newline at end of file diff --git a/test/common/test_gemm.cpp.cpp b/test/common/test_gemm.cpp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..599f46efe0939eb2b8caa79142f7632b92259d9a --- /dev/null +++ b/test/common/test_gemm.cpp.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +==============================================================================*/ + +#include +#include "common/log.h" +#include "operators/math/Gemm.h" + +#define a(i, j) a[(j)*lda + (i)] +#define b(i, j) b[(j)*ldb + (i)] +#define c1(i, j) c1[(j)*ldc + (i)] + +int main() { + int m = 45; + int n = 46; + int k = 125; + int lda = m; + int ldb = k; + int ldc = m; + + float a[45 * 125]; + float b[125 * 46]; + float c[45 * 46] = {0}; + float c1[45 * 46] = {0}; + for (int i = 0; i < m * k; ++i) { + a[i] = 2; + } + for (int i = 0; i < k * n; ++i) { + b[i] = 2; + } + + paddle_mobile::operators::math::sgemm(m, n, k, 1, a, lda, b, ldb, 0, c, ldc); + for (int i = 0; i < m * n; ++i) { + std::cout << c[i] << " | "; + if (i % n == (n - 1)) { + std::cout << std::endl; + } + } + for (int j = 0; j < n; ++j) { + for (int i = 0; i < m; ++i) { + for (int p = 0; p < k; ++p) { + c1(i, j) += a(i, p) * b(p, j); + } + } + } + std::cout << "正确结果对比:" << std::endl; + for (int i = 0; i < m * n; ++i) { + std::cout << c1[i] << " | "; + if (i % n == (n - 1)) { + std::cout << std::endl; + } + } + return 0; +}