add Gemm library

32c37cf3 · zhaojiaying01 · 4e063a2c · 32c37cf3 · 32c37cf3 · 32c37cf3
4 changed file
--- a/src/operators/math/Gemm.cpp
+++ b/src/operators/math/Gemm.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+==============================================================================*/
+#include "operators/math/Gemm.h"
+#include <algorithm>
+#include <iostream>
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+// 将A矩阵分块复制到连续内存
+void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
+                 float *buffer) {
+  int i, j;
+  const float *Aij;
+  for (i = 0; i < m - paddingM; i += MR) {
+    for (int j = 0; j < k; ++j) {
+      Aij = &A(i, j);
+      *buffer++ = *Aij;
+      *buffer++ = *(Aij + 1);
+      *buffer++ = *(Aij + 2);
+      *buffer++ = *(Aij + 3);
+    }
+  }
+  if (paddingM != 0) {
+    for (j = 0; j < k; ++j) {
+      Aij = &A(m - paddingM, j);
+      for (i = 0; i < paddingM; ++i) {
+        *buffer++ = *(Aij + i);
+      }
+      for (i = paddingM; i < MR; ++i) {
+        *buffer++ = 0;
+      }
+    }
+  }
+}
+// 将B矩阵分块复制到连续内存
+void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
+                 float *buffer) {
+  int i, j;
+  for (j = 0; j < n - paddingN; j += NR) {
+    const float *Bj = &B(0, j);
+    const float *Bj1 = &B(0, j + 1);
+    const float *Bj2 = &B(0, j + 2);
+    const float *Bj3 = &B(0, j + 3);
+    for (i = 0; i < k; ++i) {
+      *buffer++ = *Bj++;
+      *buffer++ = *Bj1++;
+      *buffer++ = *Bj2++;
+      *buffer++ = *Bj3++;
+    }
+  }
+  if (paddingN != 0) {
+    for (i = 0; i < k; ++i) {
+      for (int j = n - paddingN; j < n; ++j) {
+        const float *Bij = &B(i, j);
+        *buffer++ = *Bij++;
+      }
+      for (int j = n; j < n + (NR - paddingN); ++j) {
+        *buffer++ = 0;
+      }
+    }
+  }
+}
+// 分块矩阵乘法
+void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B,
+                 int ldb, float *C, int ldc, int first_time) {
+  int Buff_A_M = m;
+  int Buff_B_N = n;
+  int _mc = m % MR;
+  int _nc = n % NR;
+  if (_mc != 0) {
+    Buff_A_M = m + (MR - _mc);
+  }
+  if (_nc != 0) {
+    Buff_B_N = n + (NR - _nc);
+  }
+  float packedA[MC * KC];
+  static float packedB[KC * NC];
+  if (first_time) {
+    PackMatrixB(k, n, _nc, B, ldb, packedB);
+  }
+  PackMatrixA(m, k, _mc, A, lda, packedA);
+  int i, j, mc, nc;
+  // B 取 4 列, 打包预热
+  for (j = 0; j < Buff_B_N; j += NR) {
+    nc = (n - j) < NR ? _nc : NR;
+    // A 取 4 行，打包预热
+    for (i = 0; i < Buff_A_M; i += MR) {
+      mc = (m - i) < MR ? _mc : MR;
+      AddDot4x4(k, &packedA[i * k], 4, &packedB[j * k], k, &C(i, j), ldc, mc,
+                nc);
+    }
+  }
+}
+// 计算一个更小的 4 * 4 的 C 矩阵分块
+void AddDot4x4(int k, const float *a, int lda, const float *b, int ldb,
+               float *C, int ldc, int mc, int nc) {
+  float c[16] = {0};
+  float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3;
+  // // init C
+  // float32x4_t cv0 = vdup_n_f32(0.0);
+  // float32x4_t cv1 = vdup_n_f32(0.0);
+  // float32x4_t cv2 = vdup_n_f32(0.0);
+  // float32x4_t cv3 = vdup_n_f32(0.0);
+  for (int p = 0; p < k; p += 1) {
+    reg_b0 = *b++;
+    reg_b1 = *b++;
+    reg_b2 = *b++;
+    reg_b3 = *b++;
+    reg_a0 = *a++;
+    reg_a1 = *a++;
+    reg_a2 = *a++;
+    reg_a3 = *a++;
+    // first row
+    c[0] += reg_a0 * reg_b0;
+    c[1] += reg_a0 * reg_b1;
+    c[2] += reg_a0 * reg_b2;
+    c[3] += reg_a0 * reg_b3;
+    // second row
+    c[4] += reg_a1 * reg_b0;
+    c[5] += reg_a1 * reg_b1;
+    c[6] += reg_a1 * reg_b2;
+    c[7] += reg_a1 * reg_b3;
+    // third row
+    c[8] += reg_a2 * reg_b0;
+    c[9] += reg_a2 * reg_b1;
+    c[10] += reg_a2 * reg_b2;
+    c[11] += reg_a2 * reg_b3;
+    // fourth row
+    c[12] += reg_a3 * reg_b0;
+    c[13] += reg_a3 * reg_b1;
+    c[14] += reg_a3 * reg_b2;
+    c[15] += reg_a3 * reg_b3;
+  }
+  int i, j;
+  for (i = 0; i < mc; ++i) {
+    for (j = 0; j < nc; ++j) {
+      C(i, j) += c[i * 4 + j];
+    }
+  }
+}
+// 32位 float 矩阵乘法
+void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+           const float *B, int ldb, float beta, float *C, int ldc) {
+  int i, j, p, mc, nc, kc;
+  for (j = 0; j < n; j += NC) {
+    nc = min(n - j, NC);
+    for (p = 0; p < k; p += KC) {
+      kc = min(k - p, KC);
+      for (i = 0; i < m; i += MC) {
+        mc = min(m - i, MC);
+        InnerKernel(mc, nc, kc, &A(i, p), lda, &B(p, j), ldb, &C(i, j), ldc,
+                    i == 0);
+      }
+    }
+  }
+}
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/math/Gemm.h
+++ b/src/operators/math/Gemm.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+==============================================================================*/
+#pragma once
+// 矩阵取值运算宏，假设矩阵按列存储
+#define A(i, j) A[(j)*lda + (i)]
+#define B(i, j) B[(j)*ldb + (i)]
+#define C(i, j) C[(j)*ldc + (i)]
+// 分块计算的块大小，mc 与 kc 分别对应分块计算时的 m 与 k
+#define MC 384
+#define KC 384
+#define NC 4096
+#define MR 4
+#define NR 4
+#define min(i, j) ((i) < (j) ? (i) : (j))
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+// 将 A 矩阵分块复制到连续内存
+void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
+                 float *buffer);
+// 将 B 矩阵分块复制到连续内存
+void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
+                 float *buffer);
+// 分块矩阵乘法
+void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B,
+                 int ldb, float *C, int ldc, int first_time);
+// 计算一个更小的 4 * 4 的 C 矩阵分块
+void AddDot4x4(int k, const float *A, int lda, const float *B, int ldb,
+               float *C, int ldc, int mc, int nc);
+// 32位 float 矩阵乘法
+void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+           const float *B, int ldb, float beta, float *C, int ldc);
+// 64位 double 矩阵乘法
+void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
+           const double *B, int ldb, float beta, double *C, int ldc);
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -47,4 +47,8 @@ target_link_libraries(test-optimize paddle-mobile)
 #gen test
 ADD_EXECUTABLE(test-pool operators/test_pool_op.cpp test_helper.h test_include.h framework/executor_for_test.h framework/executor_for_test.cpp)
 target_link_libraries(test-pool paddle-mobile)
\ No newline at end of file
+# gen test
+ADD_EXECUTABLE(test-gemm common/test_gemm.cpp)
+target_link_libraries(test-gemm paddle-mobile)
\ No newline at end of file
--- a/test/common/test_gemm.cpp.cpp
+++ b/test/common/test_gemm.cpp.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+==============================================================================*/
+#include <iostream>
+#include "common/log.h"
+#include "operators/math/Gemm.h"
+#define a(i, j) a[(j)*lda + (i)]
+#define b(i, j) b[(j)*ldb + (i)]
+#define c1(i, j) c1[(j)*ldc + (i)]
+int main() {
+  int m = 45;
+  int n = 46;
+  int k = 125;
+  int lda = m;
+  int ldb = k;
+  int ldc = m;
+  float a[45 * 125];
+  float b[125 * 46];
+  float c[45 * 46] = {0};
+  float c1[45 * 46] = {0};
+  for (int i = 0; i < m * k; ++i) {
+    a[i] = 2;
+  }
+  for (int i = 0; i < k * n; ++i) {
+    b[i] = 2;
+  }
+  paddle_mobile::operators::math::sgemm(m, n, k, 1, a, lda, b, ldb, 0, c, ldc);
+  for (int i = 0; i < m * n; ++i) {
+    std::cout << c[i] << " | ";
+    if (i % n == (n - 1)) {
+      std::cout << std::endl;
+    }
+  }
+  for (int j = 0; j < n; ++j) {
+    for (int i = 0; i < m; ++i) {
+      for (int p = 0; p < k; ++p) {
+        c1(i, j) += a(i, p) * b(p, j);
+      }
+    }
+  }
+  std::cout << "正确结果对比:" << std::endl;
+  for (int i = 0; i < m * n; ++i) {
+    std::cout << c1[i] << " | ";
+    if (i % n == (n - 1)) {
+      std::cout << std::endl;
+    }
+  }
+  return 0;
+}