未验证 提交 fdbeb280 编写于 作者: E eclipsycn 提交者: GitHub

Merge branch 'develop' into develop

...@@ -409,9 +409,6 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict( ...@@ -409,9 +409,6 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void Executor<Dtype, P>::SetThreadNum(int num) { void Executor<Dtype, P>::SetThreadNum(int num) {
for (int k = 0; k < std::max(num, 3); ++k) {
operators::math::Gemmer::gemmers.push_back(new operators::math::Gemmer());
}
#ifdef _OPENMP #ifdef _OPENMP
// omp_set_dynamic(0); // omp_set_dynamic(0);
omp_set_num_threads(num); omp_set_num_threads(num);
......
...@@ -14,14 +14,10 @@ limitations under the License. */ ...@@ -14,14 +14,10 @@ limitations under the License. */
#ifdef FUSION_CONVADD_OP #ifdef FUSION_CONVADD_OP
#pragma once #pragma once
#if _OPENMP
#include <omp.h>
#endif
#include <vector> #include <vector>
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv_3x3.h" #include "operators/math/depthwise_conv_3x3.h"
#include "operators/math/gemm.h"
#include "operators/math/im2col.h" #include "operators/math/im2col.h"
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include "operators/math/vol2col.h" #include "operators/math/vol2col.h"
...@@ -110,33 +106,9 @@ void ConvAddBasic(const FusionConvAddParam &param) { ...@@ -110,33 +106,9 @@ void ConvAddBasic(const FusionConvAddParam &param) {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
auto dim_a = filter_slice.dims(); static_cast<float>(1), &out_slice,
auto dim_b = col_matrix.dims(); static_cast<float>(1));
auto dim_out = out_slice.dims();
int m = dim_out[0];
int n = dim_out[1];
int k = dim_a[1];
float *output_data = out_slice.data<float>();
int thread_num = 4;
int m1 = m / thread_num;
int m2 = m % thread_num;
#pragma omp parallel for
for (int j = 0; j < thread_num; ++j) {
int row_count = m1;
if (j == thread_num - 1) {
row_count = m1 + m2;
}
math::Gemmer::gemmers[j]->Sgemm(
row_count, n, k, 1, filter_slice.data<float>() + j * m1 * k, k,
col_matrix.data<float>(), n, 1, output_data + j * m1 * n, n, false);
}
// math::matmul<float>(filter_slice, false, col_matrix, false,
// static_cast<float>(1), &out_slice,
// static_cast<float>(1));
} }
} }
} }
......
...@@ -22,11 +22,17 @@ limitations under the License. */ ...@@ -22,11 +22,17 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
int MC = 0;
std::vector<Gemmer *> Gemmer::gemmers; int KC = 0;
int NC = 0;
float *packedA;
float *packedB;
float *packedC;
float *zero;
// 将A矩阵分块复制到连续内存(ColMajor) // 将A矩阵分块复制到连续内存(ColMajor)
void Gemmer::PackMatrixA(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
int i, j; int i, j;
const float *Aij; const float *Aij;
for (i = 0; i < m - m_tail; i += MR) { for (i = 0; i < m - m_tail; i += MR) {
...@@ -52,8 +58,8 @@ void Gemmer::PackMatrixA(int m, int k, int m_tail, const float *A, int lda, ...@@ -52,8 +58,8 @@ void Gemmer::PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
} }
// 将A矩阵分块复制到连续内存(RowMajor) // 将A矩阵分块复制到连续内存(RowMajor)
void Gemmer::PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
const float *a0, *a1, *a2, *a3; const float *a0, *a1, *a2, *a3;
for (int i = 0; i < m - m_tail; i += MR) { for (int i = 0; i < m - m_tail; i += MR) {
a0 = A + i * lda; a0 = A + i * lda;
...@@ -92,8 +98,8 @@ void Gemmer::PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, ...@@ -92,8 +98,8 @@ void Gemmer::PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
} }
// 将B矩阵分块复制到连续内存(ColMajor) // 将B矩阵分块复制到连续内存(ColMajor)
void Gemmer::PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) { float *buffer) {
int i, j; int i, j;
const float *Bj, *Bj1, *Bj2, *Bj3; const float *Bj, *Bj1, *Bj2, *Bj3;
for (j = 0; j < n - n_tail; j += NR) { for (j = 0; j < n - n_tail; j += NR) {
...@@ -121,8 +127,8 @@ void Gemmer::PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -121,8 +127,8 @@ void Gemmer::PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
} }
// 将B矩阵分块复制到连续内存(RowMajor) // 将B矩阵分块复制到连续内存(RowMajor)
void Gemmer::PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) { float *buffer) {
const float *b0; const float *b0;
for (int j = 0; j < n - n_tail; j += NR) { for (int j = 0; j < n - n_tail; j += NR) {
for (int i = 0; i < k; ++i) { for (int i = 0; i < k; ++i) {
...@@ -150,9 +156,8 @@ void Gemmer::PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, ...@@ -150,9 +156,8 @@ void Gemmer::PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
} }
// 分块矩阵乘法 // 分块矩阵乘法
void Gemmer::InnerKernel(int mc, int nc, float alpha, const float *a, void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
const float *b, float beta, float *c, float *C, float beta, float *c, float *C, int ldc, bool relu) {
int ldc, bool relu) {
for (int j = 0; j < nc; j += NR) { for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) { for (int i = 0; i < mc; i += MR) {
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
...@@ -179,10 +184,9 @@ void Gemmer::InnerKernel(int mc, int nc, float alpha, const float *a, ...@@ -179,10 +184,9 @@ void Gemmer::InnerKernel(int mc, int nc, float alpha, const float *a,
} }
// 分块矩阵乘法 // 分块矩阵乘法
void Gemmer::InnerKernelWithBn(int mc, int nc, float alpha, const float *a, void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, const float *b, float beta, float *c, float *C, int ldc,
int ldc, bool relu, float *new_scale, bool relu, float *new_scale, float *new_bias) {
float *new_bias) {
for (int j = 0; j < nc; j += NR) { for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) { for (int i = 0; i < mc; i += MR) {
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
...@@ -198,8 +202,7 @@ void Gemmer::InnerKernelWithBn(int mc, int nc, float alpha, const float *a, ...@@ -198,8 +202,7 @@ void Gemmer::InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
} }
#if defined(IOS) #if defined(IOS)
void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *C, void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
int ldc) {
// init C // init C
float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0);
...@@ -250,8 +253,7 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *C, ...@@ -250,8 +253,7 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *C,
} // namespace math } // namespace math
#elif defined(ARMV7) #elif defined(ARMV7)
void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c, void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
int ldc) {
const float *a_ptr, *b_ptr; const float *a_ptr, *b_ptr;
a_ptr = a; a_ptr = a;
b_ptr = b; b_ptr = b;
...@@ -322,8 +324,7 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c, ...@@ -322,8 +324,7 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c,
} }
#else #else
void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c, void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
int ldc) {
float *c0, *c1, *c2, *c3; float *c0, *c1, *c2, *c3;
c0 = c; c0 = c;
c1 = c + ldc; c1 = c + ldc;
...@@ -362,9 +363,8 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c, ...@@ -362,9 +363,8 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c,
#endif #endif
// 32位 float 矩阵乘法 // 32位 float 矩阵乘法
void Gemmer::Sgemm(int m, int n, int k, float alpha, const float *A, int lda, void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
bool relu) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73) // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster) // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 30 * 1024; int L1 = 30 * 1024;
...@@ -415,10 +415,9 @@ void Gemmer::Sgemm(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -415,10 +415,9 @@ void Gemmer::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
void Gemmer::SgemmWithBn(int m, int n, int k, float alpha, const float *A, void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
int lda, const float *B, int ldb, float beta, float *C, const float *B, int ldb, float beta, float *C, int ldc,
int ldc, bool relu, float *new_scale, bool relu, float *new_scale, float *new_bias) {
float *new_bias) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73) // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster) // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 30 * 1024; int L1 = 30 * 1024;
...@@ -469,9 +468,9 @@ void Gemmer::SgemmWithBn(int m, int n, int k, float alpha, const float *A, ...@@ -469,9 +468,9 @@ void Gemmer::SgemmWithBn(int m, int n, int k, float alpha, const float *A,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
void Gemmer::VectorKernel(int m, int n, int k, float alpha, const float *A, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
int lda, const float *B, int ldb, float beta, const float *B, int ldb, float beta, float *C, int ldc,
float *C, int ldc, bool relu) { bool relu) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n)); float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
const float *a0, *b0, *b1, *b2, *b3; const float *a0, *b0, *b1, *b2, *b3;
...@@ -691,10 +690,9 @@ void Gemmer::VectorKernel(int m, int n, int k, float alpha, const float *A, ...@@ -691,10 +690,9 @@ void Gemmer::VectorKernel(int m, int n, int k, float alpha, const float *A,
} }
} }
void Gemmer::VectorKernelWithBn(int m, int n, int k, float alpha, void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
const float *A, int lda, const float *B, int lda, const float *B, int ldb, float beta, float *C,
int ldb, float beta, float *C, int ldc, int ldc, bool relu, float *new_scale, float *new_bias) {
bool relu, float *new_scale, float *new_bias) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n)); float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
const float *a0, *b0, *b1, *b2, *b3; const float *a0, *b0, *b1, *b2, *b3;
...@@ -903,8 +901,7 @@ void Gemmer::VectorKernelWithBn(int m, int n, int k, float alpha, ...@@ -903,8 +901,7 @@ void Gemmer::VectorKernelWithBn(int m, int n, int k, float alpha,
} }
} }
void Gemmer::AddDot4x8(int k, const float *a, const float *b, float *c, void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
int ldc) {
const float *a_ptr, *b_ptr; const float *a_ptr, *b_ptr;
a_ptr = a; a_ptr = a;
b_ptr = b; b_ptr = b;
...@@ -1012,7 +1009,7 @@ void Gemmer::AddDot4x8(int k, const float *a, const float *b, float *c, ...@@ -1012,7 +1009,7 @@ void Gemmer::AddDot4x8(int k, const float *a, const float *b, float *c,
} }
// C = A * B // C = A * B
void Gemmer::WriteBasic(int mc, int nc, float *c, float *C, int ldc) { void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int step = 4 * ldc; int step = 4 * ldc;
...@@ -1069,10 +1066,10 @@ void Gemmer::WriteBasic(int mc, int nc, float *c, float *C, int ldc) { ...@@ -1069,10 +1066,10 @@ void Gemmer::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void Gemmer::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
// C = A * B + C // C = A * B + C
void Gemmer::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int step = 4 * ldc; int step = 4 * ldc;
...@@ -1136,7 +1133,7 @@ void Gemmer::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { ...@@ -1136,7 +1133,7 @@ void Gemmer::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void Gemmer::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int step = 4 * ldc; int step = 4 * ldc;
...@@ -1210,14 +1207,14 @@ void Gemmer::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { ...@@ -1210,14 +1207,14 @@ void Gemmer::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void Gemmer::WriteWithBn(int mc, int nc, float *c, float *C, int ldc, void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
float *scale, float *bias) { float *bias) {
int nc1 = nc / 16; int volatile nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int nc2 = _nc1 / 4; int volatile nc2 = _nc1 / 4;
int nc3 = 16 - 4 * (_nc1 % 4); int volatile nc3 = 16 - 4 * (_nc1 % 4);
int step = 4 * (ldc - nc); int volatile step = 4 * (ldc - nc);
int step1 = 4 * (NC - nc); int volatile step1 = 4 * (NC - nc);
asm volatile( asm volatile(
"subs %[mc], %[mc], #1 \n\t" "subs %[mc], %[mc], #1 \n\t"
...@@ -1296,8 +1293,8 @@ void Gemmer::WriteWithBn(int mc, int nc, float *c, float *C, int ldc, ...@@ -1296,8 +1293,8 @@ void Gemmer::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C), relu(C)
void Gemmer::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
float *scale, float *bias) { float *bias) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int nc2 = _nc1 / 4; int nc2 = _nc1 / 4;
...@@ -1389,7 +1386,7 @@ void Gemmer::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -1389,7 +1386,7 @@ void Gemmer::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B // C = A * B
void Gemmer::VecWriteBasic(int n, float *c, float *C, int ldc) { void VecWriteBasic(int n, float *c, float *C, int ldc) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
int nc2 = _nc1 / 4; int nc2 = _nc1 / 4;
...@@ -1435,10 +1432,10 @@ void Gemmer::VecWriteBasic(int n, float *c, float *C, int ldc) { ...@@ -1435,10 +1432,10 @@ void Gemmer::VecWriteBasic(int n, float *c, float *C, int ldc) {
} }
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void Gemmer::VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
// C = A * B + C // C = A * B + C
void Gemmer::VecWriteWithAdd(int n, float *c, float *C, int ldc) { void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
...@@ -1476,7 +1473,7 @@ void Gemmer::VecWriteWithAdd(int n, float *c, float *C, int ldc) { ...@@ -1476,7 +1473,7 @@ void Gemmer::VecWriteWithAdd(int n, float *c, float *C, int ldc) {
} }
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void Gemmer::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
...@@ -1524,8 +1521,8 @@ void Gemmer::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { ...@@ -1524,8 +1521,8 @@ void Gemmer::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
} }
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void Gemmer::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
float *bias) { float *bias) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
int nc2 = _nc1 / 4; int nc2 = _nc1 / 4;
...@@ -1591,8 +1588,8 @@ void Gemmer::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, ...@@ -1591,8 +1588,8 @@ void Gemmer::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
} }
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C), relu(C)
void Gemmer::VecWriteWithBnRelu(int n, float *c, float *C, int ldc, void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
float *scale, float *bias) { float *bias) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
int nc2 = _nc1 / 4; int nc2 = _nc1 / 4;
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector>
// 矩阵取值运算宏,假设矩阵按行存储 // 矩阵取值运算宏,假设矩阵按行存储
#define A(i, j) A[(i)*lda + (j)] #define A(i, j) A[(i)*lda + (j)]
...@@ -28,111 +27,88 @@ limitations under the License. */ ...@@ -28,111 +27,88 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
struct Gemmer {
int MC = 0;
int KC = 0;
int NC = 0;
float *packedA; // 将 A 矩阵分块复制到连续内存(ColMajor)
float *packedB; void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *packedC; float *buffer);
float *zero;
static std::vector<Gemmer *> gemmers; // 将 B 矩阵分块复制到连续内存(ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 将 A 矩阵分块复制到连续内存(ColMajor) float *buffer);
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer); // 将 A 矩阵分块复制到连续内存(RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
// 将 B 矩阵分块复制到连续内存(ColMajor) float *buffer);
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); // 将 B 矩阵分块复制到连续内存(RowMajor)
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
// 将 A 矩阵分块复制到连续内存(RowMajor) float *buffer);
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
float *buffer); // 分块矩阵乘法
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
// 将 B 矩阵分块复制到连续内存(RowMajor) float beta, float *c, float *C, int ldc, bool relu);
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc,
// 分块矩阵乘法 bool relu, float *new_scale, float *new_bias);
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
float beta, float *c, float *C, int ldc, bool relu); // 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, const float *B, int ldb, float beta, float *C, int ldc,
const float *b, float beta, float *c, float *C, bool relu);
int ldc, bool relu, float *new_scale, float *new_bias);
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
// 向量矩阵乘法 (M = 1) int lda, const float *B, int ldb, float beta, float *C,
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, int ldc, bool relu, float *new_scale, float *new_bias);
const float *B, int ldb, float beta, float *C, int ldc,
bool relu); // 计算一个更小的 C 矩阵分块
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
int lda, const float *B, int ldb, float beta,
float *C, int ldc, bool relu, float *new_scale, // 分块矩阵乘法结果回写
float *new_bias); // C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
// 计算一个更小的 C 矩阵分块 // C = alpha * A * B + beta * C
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
// 分块矩阵乘法结果回写 void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
// C = A * B // C = A * B, batchnorm(C)
void WriteBasic(int mc, int nc, float *c, float *C, int ldc); void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = alpha * A * B + beta * C // C = A * B, batchnorm(C), relu(C)
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc); void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias);
// C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc); // 向量矩阵乘法结果回写
// C = A * B
// C = A * B + C, relu(C) void VecWriteBasic(int n, float *c, float *C, int ldc);
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc); // C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C) // C = A * B + C
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, void VecWriteWithAdd(int n, float *c, float *C, int ldc);
float *new_scale, float *new_bias); // C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_scale, float *new_bias); float *new_bias);
// C = A * B, batchnorm(C), relu(C)
// 向量矩阵乘法结果回写 void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
// C = A * B float *new_bias);
void VecWriteBasic(int n, float *c, float *C, int ldc);
// 32位 float 矩阵乘法
// C = alpha * A * B + beta * C void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc); const float *B, int ldb, float beta, float *C, int ldc, bool relu);
// C = A * B + C // 32位 float 矩阵乘法, 并对结果进行 batchnorm
void VecWriteWithAdd(int n, float *c, float *C, int ldc); void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
// C = A * B + C, relu(C) bool relu, float *new_scale, float *new_bias);
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// 64位 double 矩阵乘法
// C = A * B, batchnorm(C) void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale, const double *B, int ldb, float beta, double *C, int ldc);
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// 32位 float 矩阵乘法
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, bool relu);
// 32位 float 矩阵乘法, 并对结果进行 batchnorm
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
// 64位 double 矩阵乘法
void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
const double *B, int ldb, float beta, double *C, int ldc);
};
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -26,14 +26,23 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -26,14 +26,23 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
// PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
// dim_out.size() ==
// 2,
// "The input and output of matmul be matrix");
//
// PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
// platform::is_cpu_place(matrix_b.place())
// &&
// platform::is_cpu_place(matrix_out->place()),
// "Matrix must all be in CPUPlace");
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
Gemmer::gemmers[0]->Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
matrix_b.data<float>(), N, beta, beta, matrix_out->data<float>(), N, relu);
matrix_out->data<float>(), N, relu);
} }
template <> template <>
...@@ -45,15 +54,24 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -45,15 +54,24 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
// PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
// dim_out.size() ==
// 2,
// "The input and output of matmul be matrix");
//
// PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
// platform::is_cpu_place(matrix_b.place())
// &&
// platform::is_cpu_place(matrix_out->place()),
// "Matrix must all be in CPUPlace");
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
Gemmer::gemmers[0]->SgemmWithBn( SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N, N, beta, matrix_out->data<float>(), N, relu,
beta, matrix_out->data<float>(), N, relu, new_scale->data<float>(), new_scale->data<float>(), new_bias->data<float>());
new_bias->data<float>());
} }
} // namespace math } // namespace math
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#endif #endif
#include "framework/tensor.h" #include "framework/tensor.h"
#include "pool_3x3.h" #include "pool_3x3.h"
#ifdef __ARM_NEON #if __ARM_NEON
#include <arm_neon.h> #include <arm_neon.h>
#endif // __ARM_NEON #endif // __ARM_NEON
#include <climits> #include <climits>
...@@ -30,7 +30,7 @@ using std::max; ...@@ -30,7 +30,7 @@ using std::max;
using std::min; using std::min;
using std::vector; using std::vector;
void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
#ifdef __ARM_NEON #if __ARM_NEON
const int batch_size = input->dims()[0]; const int batch_size = input->dims()[0];
const int h_in = input->dims()[2]; const int h_in = input->dims()[2];
...@@ -280,7 +280,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { ...@@ -280,7 +280,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
} }
void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) { void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
#ifdef __ARM_NEON #if __ARM_NEON
const int batch_size = input->dims()[0]; const int batch_size = input->dims()[0];
const int h_in = input->dims()[2]; const int h_in = input->dims()[2];
...@@ -523,7 +523,7 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) { ...@@ -523,7 +523,7 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input, void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) { Tensor *output) {
#ifdef __ARM_NEON #if __ARM_NEON
const int batch_size = input->dims()[0]; const int batch_size = input->dims()[0];
const int input_height = input->dims()[2]; const int input_height = input->dims()[2];
...@@ -582,7 +582,7 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -582,7 +582,7 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
} }
output_seg[ph * output_width + pw] = max_value; output_seg[ph * output_width + pw] = max_value;
} else { } else {
#ifdef ARMV7 #if defined(ARMV7)
asm volatile( asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t" "vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t" "vld1.32 {q2}, [%[pos2]] \n\t"
...@@ -622,7 +622,7 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -622,7 +622,7 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input, void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) { Tensor *output) {
#ifdef __ARM_NEON #if __ARM_NEON
const int batch_size = input->dims()[0]; const int batch_size = input->dims()[0];
const int input_height = input->dims()[2]; const int input_height = input->dims()[2];
...@@ -676,7 +676,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -676,7 +676,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
} }
output_seg[ph * output_width + pw] = sum / 9.0; output_seg[ph * output_width + pw] = sum / 9.0;
} else { } else {
#ifdef ARMV7 #if defined(ARMV7)
asm volatile( asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t" "vld1.32 {q1}, [%[pos1]] \n\t"
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "framework/tensor.h" #include "framework/tensor.h"
#ifdef __ARM_NEON #if __ARM_NEON
#include <arm_neon.h> #include <arm_neon.h>
#endif // __ARM_NEON #endif // __ARM_NEON
......
...@@ -60,8 +60,8 @@ class PoolFunctor<CPU, PoolProcess, T> { ...@@ -60,8 +60,8 @@ class PoolFunctor<CPU, PoolProcess, T> {
T *output_data = output->mutable_data<T>(); T *output_data = output->mutable_data<T>();
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
#pragma omp parallel for #pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) { for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height; int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height); int hend = std::min(hstart + ksize_height, input_height);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册