提交 70356caf 编写于 作者: Z zhaojiaying01

Revert: accelerate with openmp

上级 03581867
...@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.0) ...@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.0)
project(paddle-mobile) project(paddle-mobile)
option(DEBUGING "enable debug mode" ON) option(DEBUGING "enable debug mode" ON)
option(USE_OPENMP "openmp support" ON) option(USE_OPENMP "openmp support" OFF)
option(USE_EXCEPTION "use std exception" ON) option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" ON) option(LOG_PROFILE "log profile" ON)
# select the platform to build # select the platform to build
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "io/executor.h" #include "io/executor.h"
#include <operators/math/gemm.h>
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "common/enforce.h" #include "common/enforce.h"
...@@ -26,9 +25,6 @@ limitations under the License. */ ...@@ -26,9 +25,6 @@ limitations under the License. */
#include "framework/program/var_desc.h" #include "framework/program/var_desc.h"
#include "framework/scope.h" #include "framework/scope.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#ifdef _OPENMP
#include <omp.h>
#endif // _OPENMP
#ifdef PADDLE_EXECUTOR_MULTITHREAD #ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue> #include <queue>
#include <utility> #include <utility>
...@@ -407,17 +403,6 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict( ...@@ -407,17 +403,6 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
return result_vector; return result_vector;
} }
template <typename Dtype, Precision P>
void Executor<Dtype, P>::SetThreadNum(int num) {
for (int k = 0; k < std::max(num, 3); ++k) {
operators::math::Gemmer::gemmers.push_back(new operators::math::Gemmer());
}
#ifdef _OPENMP
// omp_set_dynamic(0);
omp_set_num_threads(num);
#endif
}
template class Executor<CPU, Precision::FP32>; template class Executor<CPU, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>; template class Executor<FPGA, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>; template class Executor<GPU_MALI, Precision::FP32>;
......
...@@ -58,8 +58,6 @@ class Executor { ...@@ -58,8 +58,6 @@ class Executor {
std::vector<Ptype> Predict(const std::vector<Ptype> &input, std::vector<Ptype> Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims); const std::vector<int64_t> &dims);
void SetThreadNum(int num);
protected: protected:
Executor() = default; Executor() = default;
void InitMemory(); void InitMemory();
......
...@@ -14,14 +14,10 @@ limitations under the License. */ ...@@ -14,14 +14,10 @@ limitations under the License. */
#ifdef FUSION_CONVADD_OP #ifdef FUSION_CONVADD_OP
#pragma once #pragma once
#if _OPENMP
#include <omp.h>
#endif
#include <vector> #include <vector>
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv_3x3.h" #include "operators/math/depthwise_conv_3x3.h"
#include "operators/math/gemm.h"
#include "operators/math/im2col.h" #include "operators/math/im2col.h"
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include "operators/math/vol2col.h" #include "operators/math/vol2col.h"
...@@ -110,33 +106,9 @@ void ConvAddBasic(const FusionConvAddParam &param) { ...@@ -110,33 +106,9 @@ void ConvAddBasic(const FusionConvAddParam &param) {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
auto dim_a = filter_slice.dims(); static_cast<float>(1), &out_slice,
auto dim_b = col_matrix.dims(); static_cast<float>(1));
auto dim_out = out_slice.dims();
int m = dim_out[0];
int n = dim_out[1];
int k = dim_a[1];
float *output_data = out_slice.data<float>();
int thread_num = 4;
int m1 = m / thread_num;
int m2 = m % thread_num;
#pragma omp parallel for
for (int j = 0; j < thread_num; ++j) {
int row_count = m1;
if (j == thread_num - 1) {
row_count = m1 + m2;
}
math::Gemmer::gemmers[j]->Sgemm(
row_count, n, k, 1, filter_slice.data<float>() + j * m1 * k, k,
col_matrix.data<float>(), n, 1, output_data + j * m1 * n, n, false);
}
// math::matmul<float>(filter_slice, false, col_matrix, false,
// static_cast<float>(1), &out_slice,
// static_cast<float>(1));
} }
} }
} }
......
...@@ -12,25 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,25 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef LRN_OP #pragma once
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/operator.h"
#include "operators/op_param.h"
#include <cmath> #include <cmath>
#include "framework/operator.h"
#include "operators/op_param.h"
#ifdef __ARM_NEON #ifdef __ARM_NEON
#include "arm_neon.h" #include <arm_neon.h>
#include "operators/math/math_func_neon.h" #include "operators/math/math_func_neon.h"
#endif #endif
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using namespace framework;
template <typename T> template <typename T>
struct LRNFunctor { struct LRNFunctor {
void operator()(const framework::Tensor &input, framework::Tensor *out, int N, void operator()(const framework::Tensor &input, framework::Tensor *out, int N,
...@@ -49,7 +44,6 @@ struct LRNFunctor { ...@@ -49,7 +44,6 @@ struct LRNFunctor {
std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), 0.0); std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), 0.0);
for (int a = 0; a < N; a++) { for (int a = 0; a < N; a++) {
#pragma parallel for
for (int b = 0; b < C; b++) { for (int b = 0; b < C; b++) {
for (int index = start; index < end; index++) { for (int index = start; index < end; index++) {
int channel = b + index; int channel = b + index;
......
...@@ -22,10 +22,16 @@ limitations under the License. */ ...@@ -22,10 +22,16 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
int MC = 0;
std::vector<Gemmer *> Gemmer::gemmers; int KC = 0;
int NC = 0;
float *packedA;
float *packedB;
float *packedC;
float *zero;
// 将A矩阵分块复制到连续内存(ColMajor) // 将A矩阵分块复制到连续内存(ColMajor)
void Gemmer::PackMatrixA(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
int i, j; int i, j;
const float *Aij; const float *Aij;
...@@ -52,7 +58,7 @@ void Gemmer::PackMatrixA(int m, int k, int m_tail, const float *A, int lda, ...@@ -52,7 +58,7 @@ void Gemmer::PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
} }
// 将A矩阵分块复制到连续内存(RowMajor) // 将A矩阵分块复制到连续内存(RowMajor)
void Gemmer::PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
const float *a0, *a1, *a2, *a3; const float *a0, *a1, *a2, *a3;
for (int i = 0; i < m - m_tail; i += MR) { for (int i = 0; i < m - m_tail; i += MR) {
...@@ -92,7 +98,7 @@ void Gemmer::PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, ...@@ -92,7 +98,7 @@ void Gemmer::PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
} }
// 将B矩阵分块复制到连续内存(ColMajor) // 将B矩阵分块复制到连续内存(ColMajor)
void Gemmer::PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) { float *buffer) {
int i, j; int i, j;
const float *Bj, *Bj1, *Bj2, *Bj3; const float *Bj, *Bj1, *Bj2, *Bj3;
...@@ -121,7 +127,7 @@ void Gemmer::PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -121,7 +127,7 @@ void Gemmer::PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
} }
// 将B矩阵分块复制到连续内存(RowMajor) // 将B矩阵分块复制到连续内存(RowMajor)
void Gemmer::PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) { float *buffer) {
const float *b0; const float *b0;
for (int j = 0; j < n - n_tail; j += NR) { for (int j = 0; j < n - n_tail; j += NR) {
...@@ -150,9 +156,8 @@ void Gemmer::PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, ...@@ -150,9 +156,8 @@ void Gemmer::PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
} }
// 分块矩阵乘法 // 分块矩阵乘法
void Gemmer::InnerKernel(int mc, int nc, float alpha, const float *a, void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
const float *b, float beta, float *c, float *C, float beta, float *c, float *C, int ldc, bool relu) {
int ldc, bool relu) {
for (int j = 0; j < nc; j += NR) { for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) { for (int i = 0; i < mc; i += MR) {
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
...@@ -179,10 +184,9 @@ void Gemmer::InnerKernel(int mc, int nc, float alpha, const float *a, ...@@ -179,10 +184,9 @@ void Gemmer::InnerKernel(int mc, int nc, float alpha, const float *a,
} }
// 分块矩阵乘法 // 分块矩阵乘法
void Gemmer::InnerKernelWithBn(int mc, int nc, float alpha, const float *a, void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, const float *b, float beta, float *c, float *C, int ldc,
int ldc, bool relu, float *new_scale, bool relu, float *new_scale, float *new_bias) {
float *new_bias) {
for (int j = 0; j < nc; j += NR) { for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) { for (int i = 0; i < mc; i += MR) {
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
...@@ -198,8 +202,7 @@ void Gemmer::InnerKernelWithBn(int mc, int nc, float alpha, const float *a, ...@@ -198,8 +202,7 @@ void Gemmer::InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
} }
#if defined(IOS) #if defined(IOS)
void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *C, void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
int ldc) {
// init C // init C
float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0);
...@@ -250,8 +253,7 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *C, ...@@ -250,8 +253,7 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *C,
} // namespace math } // namespace math
#elif defined(ARMV7) #elif defined(ARMV7)
void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c, void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
int ldc) {
const float *a_ptr, *b_ptr; const float *a_ptr, *b_ptr;
a_ptr = a; a_ptr = a;
b_ptr = b; b_ptr = b;
...@@ -322,8 +324,7 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c, ...@@ -322,8 +324,7 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c,
} }
#else #else
void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c, void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
int ldc) {
float *c0, *c1, *c2, *c3; float *c0, *c1, *c2, *c3;
c0 = c; c0 = c;
c1 = c + ldc; c1 = c + ldc;
...@@ -362,9 +363,8 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c, ...@@ -362,9 +363,8 @@ void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c,
#endif #endif
// 32位 float 矩阵乘法 // 32位 float 矩阵乘法
void Gemmer::Sgemm(int m, int n, int k, float alpha, const float *A, int lda, void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
bool relu) {
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster) // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int L1 = 30 * 1024; int L1 = 30 * 1024;
...@@ -415,10 +415,9 @@ void Gemmer::Sgemm(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -415,10 +415,9 @@ void Gemmer::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
void Gemmer::SgemmWithBn(int m, int n, int k, float alpha, const float *A, void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
int lda, const float *B, int ldb, float beta, float *C, const float *B, int ldb, float beta, float *C, int ldc,
int ldc, bool relu, float *new_scale, bool relu, float *new_scale, float *new_bias) {
float *new_bias) {
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster) // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int L1 = 30 * 1024; int L1 = 30 * 1024;
...@@ -469,9 +468,9 @@ void Gemmer::SgemmWithBn(int m, int n, int k, float alpha, const float *A, ...@@ -469,9 +468,9 @@ void Gemmer::SgemmWithBn(int m, int n, int k, float alpha, const float *A,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
void Gemmer::VectorKernel(int m, int n, int k, float alpha, const float *A, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
int lda, const float *B, int ldb, float beta, const float *B, int ldb, float beta, float *C, int ldc,
float *C, int ldc, bool relu) { bool relu) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n)); float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
const float *a0, *b0, *b1, *b2, *b3; const float *a0, *b0, *b1, *b2, *b3;
...@@ -691,10 +690,9 @@ void Gemmer::VectorKernel(int m, int n, int k, float alpha, const float *A, ...@@ -691,10 +690,9 @@ void Gemmer::VectorKernel(int m, int n, int k, float alpha, const float *A,
} }
} }
void Gemmer::VectorKernelWithBn(int m, int n, int k, float alpha, void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
const float *A, int lda, const float *B, int lda, const float *B, int ldb, float beta, float *C,
int ldb, float beta, float *C, int ldc, int ldc, bool relu, float *new_scale, float *new_bias) {
bool relu, float *new_scale, float *new_bias) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n)); float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
const float *a0, *b0, *b1, *b2, *b3; const float *a0, *b0, *b1, *b2, *b3;
...@@ -903,8 +901,7 @@ void Gemmer::VectorKernelWithBn(int m, int n, int k, float alpha, ...@@ -903,8 +901,7 @@ void Gemmer::VectorKernelWithBn(int m, int n, int k, float alpha,
} }
} }
void Gemmer::AddDot4x8(int k, const float *a, const float *b, float *c, void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
int ldc) {
const float *a_ptr, *b_ptr; const float *a_ptr, *b_ptr;
a_ptr = a; a_ptr = a;
b_ptr = b; b_ptr = b;
...@@ -1012,7 +1009,7 @@ void Gemmer::AddDot4x8(int k, const float *a, const float *b, float *c, ...@@ -1012,7 +1009,7 @@ void Gemmer::AddDot4x8(int k, const float *a, const float *b, float *c,
} }
// C = A * B // C = A * B
void Gemmer::WriteBasic(int mc, int nc, float *c, float *C, int ldc) { void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int step = 4 * ldc; int step = 4 * ldc;
...@@ -1069,10 +1066,10 @@ void Gemmer::WriteBasic(int mc, int nc, float *c, float *C, int ldc) { ...@@ -1069,10 +1066,10 @@ void Gemmer::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void Gemmer::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
// C = A * B + C // C = A * B + C
void Gemmer::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int step = 4 * ldc; int step = 4 * ldc;
...@@ -1136,7 +1133,7 @@ void Gemmer::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { ...@@ -1136,7 +1133,7 @@ void Gemmer::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void Gemmer::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int step = 4 * ldc; int step = 4 * ldc;
...@@ -1210,8 +1207,8 @@ void Gemmer::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { ...@@ -1210,8 +1207,8 @@ void Gemmer::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void Gemmer::WriteWithBn(int mc, int nc, float *c, float *C, int ldc, void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
float *scale, float *bias) { float *bias) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int nc2 = _nc1 / 4; int nc2 = _nc1 / 4;
...@@ -1296,8 +1293,8 @@ void Gemmer::WriteWithBn(int mc, int nc, float *c, float *C, int ldc, ...@@ -1296,8 +1293,8 @@ void Gemmer::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C), relu(C)
void Gemmer::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
float *scale, float *bias) { float *bias) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int nc2 = _nc1 / 4; int nc2 = _nc1 / 4;
...@@ -1389,7 +1386,7 @@ void Gemmer::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -1389,7 +1386,7 @@ void Gemmer::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B // C = A * B
void Gemmer::VecWriteBasic(int n, float *c, float *C, int ldc) { void VecWriteBasic(int n, float *c, float *C, int ldc) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
int nc2 = _nc1 / 4; int nc2 = _nc1 / 4;
...@@ -1435,10 +1432,10 @@ void Gemmer::VecWriteBasic(int n, float *c, float *C, int ldc) { ...@@ -1435,10 +1432,10 @@ void Gemmer::VecWriteBasic(int n, float *c, float *C, int ldc) {
} }
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void Gemmer::VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
// C = A * B + C // C = A * B + C
void Gemmer::VecWriteWithAdd(int n, float *c, float *C, int ldc) { void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
...@@ -1476,7 +1473,7 @@ void Gemmer::VecWriteWithAdd(int n, float *c, float *C, int ldc) { ...@@ -1476,7 +1473,7 @@ void Gemmer::VecWriteWithAdd(int n, float *c, float *C, int ldc) {
} }
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void Gemmer::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
...@@ -1524,7 +1521,7 @@ void Gemmer::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { ...@@ -1524,7 +1521,7 @@ void Gemmer::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
} }
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void Gemmer::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
float *bias) { float *bias) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
...@@ -1591,8 +1588,8 @@ void Gemmer::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, ...@@ -1591,8 +1588,8 @@ void Gemmer::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
} }
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C), relu(C)
void Gemmer::VecWriteWithBnRelu(int n, float *c, float *C, int ldc, void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
float *scale, float *bias) { float *bias) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
int nc2 = _nc1 / 4; int nc2 = _nc1 / 4;
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector>
// 矩阵取值运算宏,假设矩阵按行存储 // 矩阵取值运算宏,假设矩阵按行存储
#define A(i, j) A[(i)*lda + (j)] #define A(i, j) A[(i)*lda + (j)]
...@@ -28,111 +27,88 @@ limitations under the License. */ ...@@ -28,111 +27,88 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
struct Gemmer {
int MC = 0; // 将 A 矩阵分块复制到连续内存(ColMajor)
int KC = 0; void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
int NC = 0;
float *packedA;
float *packedB;
float *packedC;
float *zero;
static std::vector<Gemmer *> gemmers;
// 将 A 矩阵分块复制到连续内存(ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
// 将 B 矩阵分块复制到连续内存(ColMajor) // 将 B 矩阵分块复制到连续内存(ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
// 将 A 矩阵分块复制到连续内存(RowMajor) // 将 A 矩阵分块复制到连续内存(RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
// 将 B 矩阵分块复制到连续内存(RowMajor) // 将 B 矩阵分块复制到连续内存(RowMajor)
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
// 分块矩阵乘法 // 分块矩阵乘法
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
float beta, float *c, float *C, int ldc, bool relu); float beta, float *c, float *C, int ldc, bool relu);
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, const float *b, float beta, float *c, float *C, int ldc,
int ldc, bool relu, float *new_scale, float *new_bias); bool relu, float *new_scale, float *new_bias);
// 向量矩阵乘法 (M = 1) // 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu); bool relu);
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, int lda, const float *B, int ldb, float beta, float *C,
float *C, int ldc, bool relu, float *new_scale, int ldc, bool relu, float *new_scale, float *new_bias);
float *new_bias);
// 计算一个更小的 C 矩阵分块
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
// 分块矩阵乘法结果回写
// C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias);
// C = A * B, batchnorm(C), relu(C) // 计算一个更小的 C 矩阵分块
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
// 分块矩阵乘法结果回写
// C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias); float *new_scale, float *new_bias);
// 向量矩阵乘法结果回写 // 向量矩阵乘法结果回写
// C = A * B // C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc); void VecWriteBasic(int n, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
// C = alpha * A * B + beta * C void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc); // C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C // C = A * B + C, relu(C)
void VecWriteWithAdd(int n, float *c, float *C, int ldc); void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
// C = A * B + C, relu(C) void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias); float *new_bias);
// C = A * B, batchnorm(C), relu(C)
// C = A * B, batchnorm(C), relu(C) void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias); float *new_bias);
// 32位 float 矩阵乘法 // 32位 float 矩阵乘法
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, bool relu); const float *B, int ldb, float beta, float *C, int ldc, bool relu);
// 32位 float 矩阵乘法, 并对结果进行 batchnrom // 32位 float 矩阵乘法, 并对结果进行 batchnrom
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias); bool relu, float *new_scale, float *new_bias);
// 64位 double 矩阵乘法 // 64位 double 矩阵乘法
void dgemm(int m, int n, int k, float alpha, const double *A, int lda, void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
const double *B, int ldb, float beta, double *C, int ldc); const double *B, int ldb, float beta, double *C, int ldc);
};
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -26,14 +26,23 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -26,14 +26,23 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
// PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
// dim_out.size() ==
// 2,
// "The input and output of matmul be matrix");
//
// PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
// platform::is_cpu_place(matrix_b.place())
// &&
// platform::is_cpu_place(matrix_out->place()),
// "Matrix must all be in CPUPlace");
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
Gemmer::gemmers[0]->Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
matrix_b.data<float>(), N, beta, beta, matrix_out->data<float>(), N, relu);
matrix_out->data<float>(), N, relu);
} }
template <> template <>
...@@ -45,15 +54,24 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -45,15 +54,24 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
// PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
// dim_out.size() ==
// 2,
// "The input and output of matmul be matrix");
//
// PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
// platform::is_cpu_place(matrix_b.place())
// &&
// platform::is_cpu_place(matrix_out->place()),
// "Matrix must all be in CPUPlace");
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
Gemmer::gemmers[0]->SgemmWithBn( SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N, N, beta, matrix_out->data<float>(), N, relu,
beta, matrix_out->data<float>(), N, relu, new_scale->data<float>(), new_scale->data<float>(), new_bias->data<float>());
new_bias->data<float>());
} }
} // namespace math } // namespace math
......
此差异已折叠。
...@@ -15,9 +15,6 @@ limitations under the License. */ ...@@ -15,9 +15,6 @@ limitations under the License. */
#ifdef POOL_OP #ifdef POOL_OP
#pragma once #pragma once
#ifdef _OPENMP
#include <omp.h>
#endif
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "framework/tensor.h" #include "framework/tensor.h"
......
...@@ -14,11 +14,10 @@ limitations under the License. */ ...@@ -14,11 +14,10 @@ limitations under the License. */
#ifdef POOL_OP #ifdef POOL_OP
#include "pooling.h" #include "operators/math/pooling.h"
#include <algorithm>
#include <vector>
#include "common/types.h" #include "common/types.h"
#ifdef _OPENMP
#include <omp.h>
#endif
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -60,8 +59,8 @@ class PoolFunctor<CPU, PoolProcess, T> { ...@@ -60,8 +59,8 @@ class PoolFunctor<CPU, PoolProcess, T> {
T *output_data = output->mutable_data<T>(); T *output_data = output->mutable_data<T>();
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
// #pragma omp parallel for
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
#pragma omp parallel for
for (int ph = 0; ph < output_height; ++ph) { for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height; int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height); int hend = std::min(hstart + ksize_height, input_height);
......
...@@ -26,17 +26,16 @@ int main() { ...@@ -26,17 +26,16 @@ int main() {
auto time2 = time(); auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time2) << "ms\n"; DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize); paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
executor.SetThreadNum(4);
std::vector<float> input; std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224}; std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims); GetInput<float>(g_test_image_1x3x224x224, &input, dims);
auto time3 = time(); auto time3 = time();
int count = 1;
for (int i = 0; i < count; ++i) { for (int i = 0; i < 10; ++i) {
executor.Predict(input, dims); executor.Predict(input, dims);
} }
auto time4 = time(); auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) / count << "ms\n"; DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n";
return 0; return 0;
} }
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册