Commit 8f5b0311 authored by zhaojiaying01

replace openblas with gemm

Parent e639ef38
@@ -34,8 +34,6 @@ if (ANDROID)
     link_directories(third-party/protobuf/armeabi-v7a)
 else()
     # link openblas
-    include_directories(third-party/openblas/include)
-    link_directories(third-party/openblas/lib)
     link_directories(third-party/protobuf/lib)
 endif ()
@@ -47,7 +45,7 @@ if (ANDROID)
     # openblas.a need log lib
     target_link_libraries(paddle-mobile protobuf-lite)
 else()
-    target_link_libraries(paddle-mobile protobuf-lite openblas)
+    target_link_libraries(paddle-mobile protobuf-lite)
 endif ()
 #add_dependencies(paddle-mobile openblas_proj)
......
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "operators/math/Gemm.h"
-#include <iostream>
+#include "operators/math/gemm.h"
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-// Pack blocks of matrix A into contiguous memory
+// Pack blocks of matrix A into contiguous memory (ColMajor)
 void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
                  float *buffer) {
   int i, j;
@@ -45,15 +44,45 @@ void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
   }
 }
-// Pack blocks of matrix B into contiguous memory
+// Pack blocks of matrix A into contiguous memory (RowMajor)
+void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
+                  float *buffer) {
+  int i, j;
+  const float *Ai, *Ai1, *Ai2, *Ai3;
+  for (i = 0; i < m - paddingM; i += MR) {
+    Ai = &A(i, 0);
+    Ai1 = &A(i + 1, 0);
+    Ai2 = &A(i + 2, 0);
+    Ai3 = &A(i + 3, 0);
+    for (int j = 0; j < k; ++j) {
+      *buffer++ = *Ai++;
+      *buffer++ = *Ai1++;
+      *buffer++ = *Ai2++;
+      *buffer++ = *Ai3++;
+    }
+  }
+  if (paddingM != 0) {
+    for (j = 0; j < k; ++j) {
+      for (i = m - paddingM; i < m; ++i) {
+        *buffer++ = A(i, j);
+      }
+      for (i = m; i < m + (MR - paddingM); ++i) {
+        *buffer++ = 0;
+      }
+    }
+  }
+}
+// Pack blocks of matrix B into contiguous memory (ColMajor)
 void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
                  float *buffer) {
   int i, j;
+  const float *Bj, *Bj1, *Bj2, *Bj3;
   for (j = 0; j < n - paddingN; j += NR) {
-    const float *Bj = &B(0, j);
-    const float *Bj1 = &B(0, j + 1);
-    const float *Bj2 = &B(0, j + 2);
-    const float *Bj3 = &B(0, j + 3);
+    Bj = &B(0, j);
+    Bj1 = &B(0, j + 1);
+    Bj2 = &B(0, j + 2);
+    Bj3 = &B(0, j + 3);
     for (i = 0; i < k; ++i) {
       *buffer++ = *Bj++;
       *buffer++ = *Bj1++;
@@ -64,7 +93,33 @@ void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
   if (paddingN != 0) {
     for (i = 0; i < k; ++i) {
       for (int j = n - paddingN; j < n; ++j) {
-        const float *Bij = &B(i, j);
+        *buffer++ = B(i, j);
       }
       for (int j = n; j < n + (NR - paddingN); ++j) {
         *buffer++ = 0;
       }
     }
   }
 }
+// Pack blocks of matrix B into contiguous memory (RowMajor)
+void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
+                  float *buffer) {
+  int i, j;
+  const float *Bij;
+  for (j = 0; j < n - paddingN; j += NR) {
+    for (i = 0; i < k; ++i) {
+      Bij = &B(i, j);
+      *buffer++ = *Bij;
+      *buffer++ = *(Bij + 1);
+      *buffer++ = *(Bij + 2);
+      *buffer++ = *(Bij + 3);
+    }
+  }
+  if (paddingN != 0) {
+    for (i = 0; i < k; ++i) {
+      Bij = &B(i, n - paddingN);
+      for (int j = n - paddingN; j < n; ++j) {
+        *buffer++ = *Bij++;
+      }
+      for (int j = n; j < n + (NR - paddingN); ++j) {
@@ -95,9 +150,9 @@ void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B,
   static float packedB[KC * NC];
   if (first_time) {
-    PackMatrixB(k, n, _nc, B, ldb, packedB);
+    PackMatrixB_(k, n, _nc, B, ldb, packedB);
   }
-  PackMatrixA(m, k, _mc, A, lda, packedA);
+  PackMatrixA_(m, k, _mc, A, lda, packedA);
   int i, j, mc, nc;
......
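To see what the new row-major packing produces, here is a minimal standalone sketch of PackMatrixA_ (the packing logic is copied from the hunk above; MR is assumed to be 4, which the four row pointers imply). It packs a 6x3 row-major matrix, interleaving MR rows at a time and zero-filling the final partial block:

```cpp
#include <cstdio>

#define MR 4                        // assumed value; defined in gemm.h
#define A(i, j) A[(i)*lda + (j)]    // row-major access, as in gemm.h

// Copy of the row-major packing logic above, reproduced so the sketch
// compiles standalone.
void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
                  float *buffer) {
  int i, j;
  for (i = 0; i < m - paddingM; i += MR) {
    const float *Ai = &A(i, 0), *Ai1 = &A(i + 1, 0);
    const float *Ai2 = &A(i + 2, 0), *Ai3 = &A(i + 3, 0);
    for (j = 0; j < k; ++j) {       // interleave MR rows column by column
      *buffer++ = *Ai++;
      *buffer++ = *Ai1++;
      *buffer++ = *Ai2++;
      *buffer++ = *Ai3++;
    }
  }
  if (paddingM != 0) {              // last partial block, padded with zeros
    for (j = 0; j < k; ++j) {
      for (i = m - paddingM; i < m; ++i) *buffer++ = A(i, j);
      for (i = m; i < m + (MR - paddingM); ++i) *buffer++ = 0;
    }
  }
}

int main() {
  const int m = 6, k = 3, lda = k;
  float A[m * k];
  for (int i = 0; i < m * k; ++i) A[i] = static_cast<float>(i);
  float buffer[2 * MR * k];         // two MR-row blocks of k columns
  PackMatrixA_(m, k, m % MR, A, lda, buffer);
  for (int idx = 0; idx < 2 * MR * k; ++idx) printf("%.0f ", buffer[idx]);
  printf("\n");
  // Prints: 0 3 6 9 1 4 7 10 2 5 8 11  (rows 0..3 interleaved)
  //         12 15 0 0 13 16 0 0 14 17 0 0  (rows 4..5 plus zero padding)
}
```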
@@ -14,10 +14,10 @@ limitations under the License. */
 #pragma once
-// Element access macros, assuming column-major storage
-#define A(i, j) A[(j)*lda + (i)]
-#define B(i, j) B[(j)*ldb + (i)]
-#define C(i, j) C[(j)*ldc + (i)]
+// Element access macros, assuming row-major storage
+#define A(i, j) A[(i)*lda + (j)]
+#define B(i, j) B[(i)*ldb + (j)]
+#define C(i, j) C[(i)*ldc + (j)]
 // Block sizes for the tiled computation; mc and kc are the m and k of one block
 #define MC 384
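This macro swap is the heart of the port: the old macros indexed column-major storage, the new ones row-major. A short sketch shows that the same (i, j) pair picks different elements under each convention:

```cpp
#include <cstdio>

int main() {
  // Six floats viewed as a 2x3 matrix.
  float M[6] = {0, 1, 2, 3, 4, 5};
  int lda_row = 3;  // row-major leading dimension: the column count
  int lda_col = 2;  // column-major leading dimension: the row count
  // New row-major macro, A(i, j) -> A[(i)*lda + (j)]: element (0, 1)
  printf("row-major (0,1) = %g\n", M[0 * lda_row + 1]);  // prints 1
  // Old column-major macro, A(i, j) -> A[(j)*lda + (i)]: element (0, 1)
  printf("col-major (0,1) = %g\n", M[1 * lda_col + 0]);  // prints 2
  return 0;
}
```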
@@ -32,14 +32,22 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {
-// Pack blocks of matrix A into contiguous memory
+// Pack blocks of matrix A into contiguous memory (ColMajor)
 void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
                  float *buffer);
-// Pack blocks of matrix B into contiguous memory
+// Pack blocks of matrix B into contiguous memory (ColMajor)
 void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
                  float *buffer);
+// Pack blocks of matrix A into contiguous memory (RowMajor)
+void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
+                  float *buffer);
+// Pack blocks of matrix B into contiguous memory (RowMajor)
+void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
+                  float *buffer);
 // Blocked matrix multiplication
 void InnerKernel(int m, int n, int k, const float *A, int lda, const float *B,
                  int ldb, float *C, int ldc, int first_time);
......
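InnerKernel packs one MC x KC block of A and, when first_time is set, one KC x NC panel of B into its static buffer. The outer driver that ties these declarations together is elided from this diff; the following is only a plausible sketch (the name sgemm_sketch and the loop order are assumptions, not the actual implementation) of how such a driver could tile the multiplication with the MC/KC/NC constants and the row-major macros from gemm.h:

```cpp
#include "operators/math/gemm.h"

// Hypothetical outer driver: walk C in MC x NC tiles, accumulating
// KC-deep panels. Not the real sgemm, which this diff does not show.
void sgemm_sketch(int m, int n, int k, const float *A, int lda,
                  const float *B, int ldb, float *C, int ldc) {
  for (int j = 0; j < n; j += NC) {          // NC-wide column panel of C
    int nc = (n - j < NC) ? (n - j) : NC;
    for (int p = 0; p < k; p += KC) {        // KC-deep slice of A and B
      int kc = (k - p < KC) ? (k - p) : KC;
      for (int i = 0; i < m; i += MC) {      // MC-high block of C
        int mc = (m - i < MC) ? (m - i) : MC;
        // first_time only for the first block row: the packed B panel
        // is cached in InnerKernel's static buffer and reused.
        InnerKernel(mc, nc, kc, &A(i, p), lda, &B(p, j), ldb, &C(i, j),
                    ldc, i == 0);
      }
    }
  }
}
```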
@@ -13,54 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "operators/math/math_function.h"
+#include "operators/math/gemm.h"
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-template <>
-void gemm<float>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-                 const int M, const int N, const int K, const float alpha,
-                 const float *A, const float *B, const float beta, float *C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-              beta, C, ldc);
-}
-template <>
-void gemm<double>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-                  const int M, const int N, const int K, const double alpha,
-                  const double *A, const double *B, const double beta,
-                  double *C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-              beta, C, ldc);
-}
-template <>
-void gemm<float>(const bool transA, const bool transB, const int M, const int N,
-                 const int K, const float alpha, const float *A, const int lda,
-                 const float *B, const int ldb, const float beta, float *C,
-                 const int ldc) {
-  cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
-}
-template <>
-void gemm<double>(const bool transA, const bool transB, const int M,
-                  const int N, const int K, const double alpha, const double *A,
-                  const int lda, const double *B, const int ldb,
-                  const double beta, double *C, const int ldc) {
-  cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
-}
 template <>
 void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
                    const framework::Tensor &matrix_b, bool trans_b, float alpha,
@@ -83,11 +41,8 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
   int N = dim_out[1];
   int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-  gemm<float>(transA, transB, M, N, K, alpha, matrix_a.data<float>(),
-              matrix_b.data<float>(), beta, matrix_out->data<float>());
+  sgemm(M, N, K, 1, matrix_a.data<float>(), K, matrix_b.data<float>(), N, 0,
+        matrix_out->data<float>(), N);
 }
 template <>
@@ -111,12 +66,6 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
   int M = dim_out[0];
   int N = dim_out[1];
   int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-  gemm<double>(transA, transB, M, N, K, alpha, matrix_a.data<double>(),
-              matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 }  // namespace math
......
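Two behavioral notes on the matmul changes above: the new sgemm call hard-codes alpha = 1 and beta = 0 and ignores trans_a/trans_b, so only the plain non-transposed product is computed, and matmul<double> is left with an empty body. A naive row-major reference multiply, sketched below under the invented name ref_gemm, is enough to cross-check the float path: compare its output element-wise against sgemm(M, N, K, 1, A, K, B, N, 0, C, N).

```cpp
#include <cstdio>

// Naive row-major reference: C = A * B with alpha = 1, beta = 0,
// matching the call shape above (lda = K, ldb = N, ldc = N).
void ref_gemm(int M, int N, int K, const float *A, const float *B, float *C) {
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j) {
      float acc = 0;
      for (int p = 0; p < K; ++p) acc += A[i * K + p] * B[p * N + j];
      C[i * N + j] = acc;  // beta = 0: overwrite rather than accumulate
    }
}

int main() {
  const int M = 2, N = 2, K = 3;
  float A[M * K] = {1, 2, 3, 4, 5, 6};
  float B[K * N] = {1, 0, 0, 1, 1, 1};
  float C[M * N];
  ref_gemm(M, N, K, A, B, C);
  printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);  // prints 4 5 10 11
}
```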
@@ -14,7 +14,6 @@ limitations under the License. */
 #pragma once
-#include <cblas.h>
 #include <cmath>
 #include "framework/tensor.h"
@@ -22,16 +21,6 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {
-template <typename T>
-void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-          const int M, const int N, const int K, const T alpha, const T *A,
-          const T *B, const T beta, T *C);
-template <typename T>
-void gemm(const bool transA, const bool transB, const int M, const int N,
-          const int K, const T alpha, const T *A, const int lda, const T *B,
-          const int ldb, const T beta, T *C, const int ldc);
 // matrix multiply with continuous memory
 template <typename T>
 void matmul(const framework::Tensor &matrix_a, bool trans_a,
......
@@ -14,24 +14,25 @@ limitations under the License. */
 #include <iostream>
 #include "common/log.h"
-#include "operators/math/Gemm.h"
+#include "operators/math/gemm.h"
-#define a(i, j) a[(j)*lda + (i)]
-#define b(i, j) b[(j)*ldb + (i)]
-#define c1(i, j) c1[(j)*ldc + (i)]
+#define a(i, j) a[(i)*lda + (j)]
+#define b(i, j) b[(i)*ldb + (j)]
+#define c1(i, j) c1[(i)*ldc + (j)]
+#define m 7
+#define n 7
+#define k 7
 int main() {
-  int m = 45;
-  int n = 46;
-  int k = 125;
-  int lda = m;
-  int ldb = k;
-  int ldc = m;
+  int lda = k;
+  int ldb = n;
+  int ldc = n;
-  float a[45 * 125];
-  float b[125 * 46];
-  float c[45 * 46] = {0};
-  float c1[45 * 46] = {0};
+  float a[7 * 7];
+  float b[7 * 7];
+  float c[7 * 7] = {0};
+  float c1[7 * 7] = {0};
   for (int i = 0; i < m * k; ++i) {
     a[i] = 2;
   }
......
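With m = n = k = 7 and every element of a set to 2, the expected result is easy to verify by hand, assuming the elided lines fill b with 2 as well (that loop is cut off above): each element of c is the sum over 7 products of 2 * 2, i.e. 28. A standalone sketch of the check:

```cpp
#include <cstdio>

int main() {
  const int m = 7, n = 7, k = 7;
  float a[m * k], b[k * n], c[m * n] = {0};
  for (int i = 0; i < m * k; ++i) a[i] = 2;
  for (int i = 0; i < k * n; ++i) b[i] = 2;  // assumed fill; elided in the diff
  // Row-major triple loop, same convention as the new macros above.
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p)
        c[i * n + j] += a[i * k + p] * b[p * n + j];
  printf("c[0][0] = %g (expected 28)\n", c[0]);
}
```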