Merge pull request #724 from smilejames/develop

PaddleMobile OPENMP 第一个多线程版本

Merge pull request #724 from smilejames/develop
PaddleMobile OPENMP 第一个多线程版本
0db5565f · Ruilong Liu · GitHub · 0fa8026d · 8ee1a52e · 0db5565f
Showing with 600 additions and 99 deletions

src/operators/math/gemm.cpp src/operators/math/gemm.cpp +568 -99

src/operators/math/gemm.h src/operators/math/gemm.h +20 -0

src/operators/math/math_function.cpp src/operators/math/math_function.cpp +12 -0

未找到文件。
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -50,6 +50,10 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
                    float *buffer);
 void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
                    float *buffer);
+void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
+                        float *buffer);
+void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
+                        float *buffer);
 // 将 B 矩阵分块复制到连续内存(RowMajor)
 void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
@@ -58,6 +62,12 @@ void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
                     float *buffer);
 void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
                     float *buffer);
+void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
+                        float *buffer);
+void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
+                         float *buffer);
+void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
+                         float *buffer);
 // 分块矩阵乘法
 void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
@@ -136,6 +146,16 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
                 const float *B, int ldb, float beta, float *C, int ldc,
                 bool relu, float *new_scale, float *new_bias);
+// 32位 float 矩阵乘法（openmp 多线程版本）
+void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
+               const float *B, int ldb, float beta, float *C, int ldc,
+               bool relu, float *bias);
+// 32位 float 矩阵乘法, 并对结果进行 batchnorm（openmp 多线程版本）
+void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
+                     const float *B, int ldb, float beta, float *C, int ldc,
+                     bool relu, float *new_scale, float *new_bias);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
@@ -42,8 +42,13 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
  int N = dim_out[1];
  int K = (!trans_a) ? dim_a[1] : dim_a[0];
+#ifdef _OPENMP
+  Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
+            N, beta, matrix_out->data<float>(), N, relu, bias);
+#else
  Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
        beta, matrix_out->data<float>(), N, relu, bias);
+#endif
 }
 template <>
@@ -70,10 +75,17 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
  int N = dim_out[1];
  int K = (!trans_a) ? dim_a[1] : dim_a[0];
+#ifdef _OPENMP
+  SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K,
+                  matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
+                  relu, new_scale->data<float>() + group,
+                  new_bias->data<float>() + group);
+#else
  SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
              N, beta, matrix_out->data<float>(), N, relu,
              new_scale->data<float>() + group,
              new_bias->data<float>() + group);
+#endif
 }
 }  // namespace math