提交 061e4ca6 编写于 作者: S smilejames 提交者: GitHub

Merge pull request #259 from smilejames/develop

replace openblas with gemm
...@@ -34,8 +34,6 @@ if (ANDROID) ...@@ -34,8 +34,6 @@ if (ANDROID)
link_directories(third-party/protobuf/armeabi-v7a) link_directories(third-party/protobuf/armeabi-v7a)
else() else()
# link openblas # link openblas
include_directories(third-party/openblas/include)
link_directories(third-party/openblas/lib)
link_directories(third-party/protobuf/lib) link_directories(third-party/protobuf/lib)
endif () endif ()
...@@ -47,7 +45,7 @@ if (ANDROID) ...@@ -47,7 +45,7 @@ if (ANDROID)
# openblas.a need log lib # openblas.a need log lib
target_link_libraries(paddle-mobile protobuf-lite) target_link_libraries(paddle-mobile protobuf-lite)
else() else()
target_link_libraries(paddle-mobile protobuf-lite openblas) target_link_libraries(paddle-mobile protobuf-lite)
endif () endif ()
#add_dependencies(paddle-mobile openblas_proj) #add_dependencies(paddle-mobile openblas_proj)
......
...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "operators/math/Gemm.h" #include "operators/math/gemm.h"
#include <iostream>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
// 将A矩阵分块复制到连续内存 // 将A矩阵分块复制到连续内存(ColMajor)
void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
float *buffer) { float *buffer) {
int i, j; int i, j;
...@@ -45,15 +44,45 @@ void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, ...@@ -45,15 +44,45 @@ void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
} }
} }
// 将B矩阵分块复制到连续内存 // 将A矩阵分块复制到连续内存(RowMajor)
// Packs matrix A into the contiguous `buffer`, reading elements through the
// A(i, j) accessor macro (row-major variant of PackMatrixA).
//
//   m, k      - number of rows / columns of A to pack
//   paddingM  - number of trailing rows that do not fill a whole MR-row panel
//               (0 means m is handled entirely by the full-panel loop)
//   A, lda    - source matrix and its leading dimension (consumed by A(i, j))
//   buffer    - destination; written sequentially, panel after panel
//
// Full panels are written column by column, four row values per column.
// NOTE(review): the unrolled copy handles exactly four rows per MR step, so
// it presumably relies on MR == 4 -- confirm against the header.
// The trailing partial panel is written the same way, with the missing rows
// filled with zeros so that every column still occupies MR floats.
void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
                  float *buffer) {
  const int full_rows = m - paddingM;
  float *out = buffer;

  // Whole panels: interleave four consecutive rows, one column at a time.
  for (int row = 0; row < full_rows; row += MR) {
    const float *r0 = &A(row, 0);
    const float *r1 = &A(row + 1, 0);
    const float *r2 = &A(row + 2, 0);
    const float *r3 = &A(row + 3, 0);
    for (int col = 0; col < k; ++col) {
      out[0] = r0[col];
      out[1] = r1[col];
      out[2] = r2[col];
      out[3] = r3[col];
      out += 4;
    }
  }

  if (paddingM == 0) {
    return;
  }

  // Remainder panel: copy the leftover rows, then zero-pad up to MR rows.
  const int zero_fill = MR - paddingM;
  for (int col = 0; col < k; ++col) {
    for (int row = full_rows; row < m; ++row) {
      *out++ = A(row, col);
    }
    for (int z = 0; z < zero_fill; ++z) {
      *out++ = 0;
    }
  }
}
// 将B矩阵分块复制到连续内存(ColMajor)
void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
float *buffer) { float *buffer) {
int i, j; int i, j;
const float *Bj, *Bj1, *Bj2, *Bj3;
for (j = 0; j < n - paddingN; j += NR) { for (j = 0; j < n - paddingN; j += NR) {
const float *Bj = &B(0, j); Bj = &B(0, j);
const float *Bj1 = &B(0, j + 1); Bj1 = &B(0, j + 1);
const float *Bj2 = &B(0, j + 2); Bj2 = &B(0, j + 2);
const float *Bj3 = &B(0, j + 3); Bj3 = &B(0, j + 3);
for (i = 0; i < k; ++i) { for (i = 0; i < k; ++i) {
*buffer++ = *Bj++; *buffer++ = *Bj++;
*buffer++ = *Bj1++; *buffer++ = *Bj1++;
...@@ -64,7 +93,33 @@ void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, ...@@ -64,7 +93,33 @@ void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
if (paddingN != 0) { if (paddingN != 0) {
for (i = 0; i < k; ++i) { for (i = 0; i < k; ++i) {
for (int j = n - paddingN; j < n; ++j) { for (int j = n - paddingN; j < n; ++j) {
const float *Bij = &B(i, j); *buffer++ = B(i, j);
}
for (int j = n; j < n + (NR - paddingN); ++j) {
*buffer++ = 0;
}
}
}
}
// 将B矩阵分块复制到连续内存(RowMajor)
void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
float *buffer) {
int i, j;
const float *Bij;
for (j = 0; j < n - paddingN; j += NR) {
for (i = 0; i < k; ++i) {
Bij = &B(i, j);
*buffer++ = *Bij;
*buffer++ = *(Bij + 1);
*buffer++ = *(Bij + 2);
*buffer++ = *(Bij + 3);
}
}
if (paddingN != 0) {
for (i = 0; i < k; ++i) {
Bij = &B(i, n - paddingN);
for (int j = n - paddingN; j < n; ++j) {
*buffer++ = *Bij++; *buffer++ = *Bij++;
} }
for (int j = n; j < n + (NR - paddingN); ++j) { for (int j = n; j < n + (NR - paddingN); ++j) {
...@@ -95,9 +150,9 @@ void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B, ...@@ -95,9 +150,9 @@ void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B,
static float packedB[KC * NC]; static float packedB[KC * NC];
if (first_time) { if (first_time) {
PackMatrixB(k, n, _nc, B, ldb, packedB); PackMatrixB_(k, n, _nc, B, ldb, packedB);
} }
PackMatrixA(m, k, _mc, A, lda, packedA); PackMatrixA_(m, k, _mc, A, lda, packedA);
int i, j, mc, nc; int i, j, mc, nc;
......
...@@ -14,10 +14,10 @@ limitations under the License. */ ...@@ -14,10 +14,10 @@ limitations under the License. */
#pragma once #pragma once
// 矩阵取值运算宏,假设矩阵按存储 // 矩阵取值运算宏,假设矩阵按存储
#define A(i, j) A[(j)*lda + (i)] #define A(i, j) A[(i)*lda + (j)]
#define B(i, j) B[(j)*ldb + (i)] #define B(i, j) B[(i)*ldb + (j)]
#define C(i, j) C[(j)*ldc + (i)] #define C(i, j) C[(i)*ldc + (j)]
// 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k // 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k
#define MC 384 #define MC 384
...@@ -32,14 +32,22 @@ namespace paddle_mobile { ...@@ -32,14 +32,22 @@ namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
// 将 A 矩阵分块复制到连续内存 // 将 A 矩阵分块复制到连续内存(ColMajor)
void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
float *buffer); float *buffer);
// 将 B 矩阵分块复制到连续内存 // 将 B 矩阵分块复制到连续内存(ColMajor)
void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
float *buffer); float *buffer);
// 将 A 矩阵分块复制到连续内存(RowMajor)
void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
float *buffer);
// 将 B 矩阵分块复制到连续内存(RowMajor)
void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
float *buffer);
// 分块矩阵乘法 // 分块矩阵乘法
void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B, void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B,
int ldb, float *C, int ldc, int first_time); int ldb, float *C, int ldc, int first_time);
......
...@@ -13,54 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,54 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include "operators/math/gemm.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
// Single-precision GEMM specialization taking CBLAS transpose flags.
// Forwards to cblas_sgemm with CblasRowMajor layout; per the CBLAS contract
// that call computes C = alpha * op(A) * op(B) + beta * C.
// The leading dimensions are not supplied by the caller; they are derived
// here from M, N, K and the transpose flags.
template <>
void gemm<float>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
const int M, const int N, const int K, const float alpha,
const float *A, const float *B, const float beta, float *C) {
// A's leading dimension is K when A is not transposed, otherwise M.
int lda = (transA == CblasNoTrans) ? K : M;
// B's leading dimension is N when B is not transposed, otherwise K.
int ldb = (transB == CblasNoTrans) ? N : K;
// C's leading dimension is always N.
int ldc = N;
cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
}
// Double-precision GEMM specialization taking CBLAS transpose flags.
// Forwards to cblas_dgemm with CblasRowMajor layout; per the CBLAS contract
// that call computes C = alpha * op(A) * op(B) + beta * C.
// The leading dimensions are not supplied by the caller; they are derived
// here from M, N, K and the transpose flags.
template <>
void gemm<double>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
const int M, const int N, const int K, const double alpha,
const double *A, const double *B, const double beta,
double *C) {
// A's leading dimension is K when A is not transposed, otherwise M.
int lda = (transA == CblasNoTrans) ? K : M;
// B's leading dimension is N when B is not transposed, otherwise K.
int ldb = (transB == CblasNoTrans) ? N : K;
// C's leading dimension is always N.
int ldc = N;
cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
}
// Single-precision GEMM specialization with boolean transpose flags and
// caller-supplied leading dimensions (lda, ldb, ldc).
// Each flag is mapped to CblasNoTrans (false) or CblasTrans (true) and the
// call is forwarded to cblas_sgemm using CblasRowMajor layout.
template <>
void gemm<float>(const bool transA, const bool transB, const int M, const int N,
const int K, const float alpha, const float *A, const int lda,
const float *B, const int ldb, const float beta, float *C,
const int ldc) {
cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
lda, B, ldb, beta, C, ldc);
}
// Double-precision GEMM specialization with boolean transpose flags and
// caller-supplied leading dimensions (lda, ldb, ldc).
// Each flag is mapped to CblasNoTrans (false) or CblasTrans (true) and the
// call is forwarded to cblas_dgemm using CblasRowMajor layout.
template <>
void gemm<double>(const bool transA, const bool transB, const int M,
const int N, const int K, const double alpha, const double *A,
const int lda, const double *B, const int ldb,
const double beta, double *C, const int ldc) {
cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
lda, B, ldb, beta, C, ldc);
}
template <> template <>
void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha, const framework::Tensor &matrix_b, bool trans_b, float alpha,
...@@ -83,11 +41,8 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -83,11 +41,8 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
int N = dim_out[1]; int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0]; int K = (trans_a == false) ? dim_a[1] : dim_a[0];
CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; sgemm(M, N, K, 1, matrix_a.data<float>(), K, matrix_b.data<float>(), N, 0,
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; matrix_out->data<float>(), N);
gemm<float>(transA, transB, M, N, K, alpha, matrix_a.data<float>(),
matrix_b.data<float>(), beta, matrix_out->data<float>());
} }
template <> template <>
...@@ -111,12 +66,6 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -111,12 +66,6 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0]; int K = (trans_a == false) ? dim_a[1] : dim_a[0];
CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<double>(transA, transB, M, N, K, alpha, matrix_a.data<double>(),
matrix_b.data<double>(), beta, matrix_out->data<double>());
} }
} // namespace math } // namespace math
......
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#include <cblas.h>
#include <cmath> #include <cmath>
#include "framework/tensor.h" #include "framework/tensor.h"
...@@ -22,16 +21,6 @@ namespace paddle_mobile { ...@@ -22,16 +21,6 @@ namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename T>
void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
const int M, const int N, const int K, const T alpha, const T *A,
const T *B, const T beta, T *C);
template <typename T>
void gemm(const bool transA, const bool transB, const int M, const int N,
const int K, const T alpha, const T *A, const int lda, const T *B,
const int ldb, const T beta, T *C, const int ldc);
// matrix multiply with continuous memory // matrix multiply with continuous memory
template <typename T> template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul(const framework::Tensor &matrix_a, bool trans_a,
......
...@@ -14,24 +14,25 @@ limitations under the License. */ ...@@ -14,24 +14,25 @@ limitations under the License. */
#include <iostream> #include <iostream>
#include "common/log.h" #include "common/log.h"
#include "operators/math/Gemm.h" #include "operators/math/gemm.h"
#define a(i, j) a[(j)*lda + (i)] #define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(j)*ldb + (i)] #define b(i, j) b[(i)*ldb + (j)]
#define c1(i, j) c1[(j)*ldc + (i)] #define c1(i, j) c1[(i)*ldc + (j)]
#define m 7
#define n 7
#define k 7
int main() { int main() {
int m = 45; int lda = k;
int n = 46; int ldb = n;
int k = 125; int ldc = n;
int lda = m;
int ldb = k;
int ldc = m;
float a[45 * 125]; float a[7 * 7];
float b[125 * 46]; float b[7 * 7];
float c[45 * 46] = {0}; float c[7 * 7] = {0};
float c1[45 * 46] = {0}; float c1[7 * 7] = {0};
for (int i = 0; i < m * k; ++i) { for (int i = 0; i < m * k; ++i) {
a[i] = 2; a[i] = 2;
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册