Commit e2ae2ae3, authored by eclipsycn, committed by GitHub

Merge branch 'develop' into develop

@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.0)
 project(paddle-mobile)
 option(DEBUGING "enable debug mode" ON)
-option(USE_OPENMP "openmp support" OFF)
+option(USE_OPENMP "openmp support" ON)
 option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" ON)
 # select the platform to build
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "io/executor.h"
+#include <operators/math/gemm.h>
 #include <algorithm>
 #include <vector>
 #include "common/enforce.h"
@@ -25,6 +26,9 @@ limitations under the License. */
 #include "framework/program/var_desc.h"
 #include "framework/scope.h"
 #include "framework/tensor.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif  // _OPENMP
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
 #include <queue>
 #include <utility>
@@ -403,6 +407,17 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
   return result_vector;
 }
+
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::SetThreadNum(int num) {
+  for (int k = 0; k < std::max(num, 3); ++k) {
+    operators::math::Gemmer::gemmers.push_back(new operators::math::Gemmer());
+  }
+#ifdef _OPENMP
+  // omp_set_dynamic(0);
+  omp_set_num_threads(num);
+#endif
+}
 template class Executor<CPU, Precision::FP32>;
 template class Executor<FPGA, Precision::FP32>;
 template class Executor<GPU_MALI, Precision::FP32>;
......
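
The new Executor::SetThreadNum is what populates the shared Gemmer pool, so it has to run before the first Predict(). It keeps std::max(num, 3) instances, while the fused conv-add kernel further down always indexes four of them, which is why the updated test calls SetThreadNum(4). A minimal self-contained sketch of the same pattern (Worker and workers are stand-ins for operators::math::Gemmer and Gemmer::gemmers, not part of the patch):

    #include <algorithm>
    #include <vector>
    #ifdef _OPENMP
    #include <omp.h>
    #endif

    struct Worker {};                      // stand-in for operators::math::Gemmer
    static std::vector<Worker *> workers;  // stand-in for Gemmer::gemmers

    void SetThreadNum(int num) {
      // Keep at least three workers around, mirroring the executor change.
      for (int k = 0; k < std::max(num, 3); ++k) {
        workers.push_back(new Worker());
      }
    #ifdef _OPENMP
      omp_set_num_threads(num);  // cap the OpenMP team size for later loops
    #endif
    }
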
@@ -58,6 +58,8 @@ class Executor {
   std::vector<Ptype> Predict(const std::vector<Ptype> &input,
                              const std::vector<int64_t> &dims);
+
+  void SetThreadNum(int num);
  protected:
   Executor() = default;
   void InitMemory();
......
@@ -14,10 +14,14 @@ limitations under the License. */
 #ifdef FUSION_CONVADD_OP
 #pragma once
+#if _OPENMP
+#include <omp.h>
+#endif
 #include <vector>
 #include "operators/math/conv_func.h"
 #include "operators/math/depthwise_conv_3x3.h"
+#include "operators/math/gemm.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
@@ -106,9 +110,33 @@ void ConvAddBasic(const FusionConvAddParam &param) {
       // gemm
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
-                          static_cast<float>(1));
+
+      auto dim_a = filter_slice.dims();
+      auto dim_b = col_matrix.dims();
+      auto dim_out = out_slice.dims();
+      int m = dim_out[0];
+      int n = dim_out[1];
+      int k = dim_a[1];
+
+      float *output_data = out_slice.data<float>();
+      int thread_num = 4;
+      int m1 = m / thread_num;
+      int m2 = m % thread_num;
+#pragma omp parallel for
+      for (int j = 0; j < thread_num; ++j) {
+        int row_count = m1;
+        if (j == thread_num - 1) {
+          row_count = m1 + m2;
+        }
+        math::Gemmer::gemmers[j]->Sgemm(
+            row_count, n, k, 1, filter_slice.data<float>() + j * m1 * k, k,
+            col_matrix.data<float>(), n, 1, output_data + j * m1 * n, n, false);
+      }
+
+      //      math::matmul<float>(filter_slice, false, col_matrix, false,
+      //                          static_cast<float>(1), &out_slice,
+      //                          static_cast<float>(1));
     }
   }
 }
......
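
The replacement above fixes thread_num at 4 and splits the m output rows of each group's GEMM into row strips, with the last strip absorbing the remainder. The sketch below illustrates only that partitioning, with a naive reference GEMM standing in for Gemmer::Sgemm (illustrative, not code from the patch); the strips write disjoint rows of C, so the parallel loop is safe.

    // Reference GEMM: C (m x n) = A (m x k) * B (k x n), row-major.
    static void NaiveGemm(int m, int n, int k, const float *A, int lda,
                          const float *B, int ldb, float *C, int ldc) {
      for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
          float acc = 0;
          for (int p = 0; p < k; ++p) acc += A[i * lda + p] * B[p * ldb + j];
          C[i * ldc + j] = acc;
        }
      }
    }

    // Row-strip partitioning as used in ConvAddBasic: strip j covers rows
    // [j * m1, j * m1 + row_count) of A and of C, so strips never overlap.
    void RowStripGemm(int m, int n, int k, const float *A, const float *B,
                      float *C, int thread_num = 4) {
      int m1 = m / thread_num;  // rows per strip
      int m2 = m % thread_num;  // remainder goes to the last strip
    #pragma omp parallel for
      for (int j = 0; j < thread_num; ++j) {
        int row_count = (j == thread_num - 1) ? m1 + m2 : m1;
        NaiveGemm(row_count, n, k, A + j * m1 * k, k, B, n, C + j * m1 * n, n);
      }
    }
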
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef LRN_OP
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 #include "framework/operator.h"
 #include "operators/op_param.h"
@@ -47,6 +49,7 @@ struct LRNFunctor {
     std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), 0.0);
     for (int a = 0; a < N; a++) {
+#pragma parallel for
       for (int b = 0; b < C; b++) {
         for (int index = start; index < end; index++) {
           int channel = b + index;
......
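
One caveat on the hunk above: the added directive is spelled "#pragma parallel for", which is not an OpenMP directive, so compilers treat it as an unknown pragma and the loop stays serial. For reference only (hypothetical snippet, not part of the patch), the OpenMP spelling requires the omp keyword:

    void Example(int C) {
    #pragma omp parallel for
      for (int b = 0; b < C; b++) {
        // per-channel work would go here, as in the LRNFunctor loop above
      }
    }
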
@@ -22,17 +22,11 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-int MC = 0;
-int KC = 0;
-int NC = 0;
-float *packedA;
-float *packedB;
-float *packedC;
-float *zero;
+std::vector<Gemmer *> Gemmer::gemmers;
 // Copy blocks of matrix A into contiguous memory (ColMajor)
-void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
-                 float *buffer) {
+void Gemmer::PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
+                         float *buffer) {
   int i, j;
   const float *Aij;
   for (i = 0; i < m - m_tail; i += MR) {
@@ -58,8 +52,8 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
 }
 // Copy blocks of matrix A into contiguous memory (RowMajor)
-void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
-                  float *buffer) {
+void Gemmer::PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
+                          float *buffer) {
   const float *a0, *a1, *a2, *a3;
   for (int i = 0; i < m - m_tail; i += MR) {
     a0 = A + i * lda;
@@ -98,8 +92,8 @@ void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
 }
 // Copy blocks of matrix B into contiguous memory (ColMajor)
-void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
-                 float *buffer) {
+void Gemmer::PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
+                         float *buffer) {
   int i, j;
   const float *Bj, *Bj1, *Bj2, *Bj3;
   for (j = 0; j < n - n_tail; j += NR) {
@@ -127,8 +121,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
 }
 // Copy blocks of matrix B into contiguous memory (RowMajor)
-void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
-                  float *buffer) {
+void Gemmer::PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
+                          float *buffer) {
   const float *b0;
   for (int j = 0; j < n - n_tail; j += NR) {
     for (int i = 0; i < k; ++i) {
@@ -156,8 +150,9 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
 }
 // Blocked matrix multiplication
-void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
-                 float beta, float *c, float *C, int ldc, bool relu) {
+void Gemmer::InnerKernel(int mc, int nc, float alpha, const float *a,
+                         const float *b, float beta, float *c, float *C,
+                         int ldc, bool relu) {
   for (int j = 0; j < nc; j += NR) {
     for (int i = 0; i < mc; i += MR) {
       // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
@@ -184,9 +179,10 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
 }
 // Blocked matrix multiplication
-void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
-                       const float *b, float beta, float *c, float *C, int ldc,
-                       bool relu, float *new_scale, float *new_bias) {
+void Gemmer::InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
+                               const float *b, float beta, float *c, float *C,
+                               int ldc, bool relu, float *new_scale,
+                               float *new_bias) {
   for (int j = 0; j < nc; j += NR) {
     for (int i = 0; i < mc; i += MR) {
       // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
@@ -202,7 +198,8 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
 }
 #if defined(IOS)
-void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
+void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *C,
+                       int ldc) {
   // init C
   float32x4_t cv0 = vdupq_n_f32(0.0);
   float32x4_t cv1 = vdupq_n_f32(0.0);
@@ -253,7 +250,8 @@ void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
 }  // namespace math
 #elif defined(ARMV7)
-void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c,
+                       int ldc) {
   const float *a_ptr, *b_ptr;
   a_ptr = a;
   b_ptr = b;
@@ -324,7 +322,8 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
 }
 #else
-void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemmer::AddDot4x4(int k, const float *a, const float *b, float *c,
+                       int ldc) {
   float *c0, *c1, *c2, *c3;
   c0 = c;
   c1 = c + ldc;
@@ -363,8 +362,9 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
 #endif
 // 32-bit float matrix multiplication
-void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-           const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
+void Gemmer::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+                   const float *B, int ldb, float beta, float *C, int ldc,
+                   bool relu) {
   // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
   // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
   int L1 = 30 * 1024;
@@ -415,9 +415,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
   paddle_mobile::memory::Free(zero);
 }
-void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
-                 const float *B, int ldb, float beta, float *C, int ldc,
-                 bool relu, float *new_scale, float *new_bias) {
+void Gemmer::SgemmWithBn(int m, int n, int k, float alpha, const float *A,
+                         int lda, const float *B, int ldb, float beta, float *C,
+                         int ldc, bool relu, float *new_scale,
+                         float *new_bias) {
   // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
   // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
   int L1 = 30 * 1024;
@@ -468,9 +469,9 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
   paddle_mobile::memory::Free(zero);
 }
-void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
-                  const float *B, int ldb, float beta, float *C, int ldc,
-                  bool relu) {
+void Gemmer::VectorKernel(int m, int n, int k, float alpha, const float *A,
+                          int lda, const float *B, int ldb, float beta,
+                          float *C, int ldc, bool relu) {
   float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
   const float *a0, *b0, *b1, *b2, *b3;
@@ -690,9 +691,10 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
   }
 }
-void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
-                        int lda, const float *B, int ldb, float beta, float *C,
-                        int ldc, bool relu, float *new_scale, float *new_bias) {
+void Gemmer::VectorKernelWithBn(int m, int n, int k, float alpha,
+                                const float *A, int lda, const float *B,
+                                int ldb, float beta, float *C, int ldc,
+                                bool relu, float *new_scale, float *new_bias) {
   float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
   const float *a0, *b0, *b1, *b2, *b3;
@@ -901,7 +903,8 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
   }
 }
-void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemmer::AddDot4x8(int k, const float *a, const float *b, float *c,
+                       int ldc) {
   const float *a_ptr, *b_ptr;
   a_ptr = a;
   b_ptr = b;
@@ -1009,7 +1012,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
 }
 // C = A * B
-void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
+void Gemmer::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
   int nc1 = nc / 16;
   int _nc1 = nc % 16;
   int step = 4 * ldc;
@@ -1066,10 +1069,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = alpha * A * B + beta * C
-void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
+void Gemmer::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
 // C = A * B + C
-void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
+void Gemmer::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
   int nc1 = nc / 16;
   int _nc1 = nc % 16;
   int step = 4 * ldc;
@@ -1133,7 +1136,7 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = A * B + C, relu(C)
-void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
+void Gemmer::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
   int nc1 = nc / 16;
   int _nc1 = nc % 16;
   int step = 4 * ldc;
@@ -1207,8 +1210,8 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
 }
 // C = A * B, batchnorm(C)
-void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
-                 float *bias) {
+void Gemmer::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
+                         float *scale, float *bias) {
   int nc1 = nc / 16;
   int _nc1 = nc % 16;
   int nc2 = _nc1 / 4;
@@ -1293,8 +1296,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
 }
 // C = A * B, batchnorm(C), relu(C)
-void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
-                     float *bias) {
+void Gemmer::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
+                             float *scale, float *bias) {
   int nc1 = nc / 16;
   int _nc1 = nc % 16;
   int nc2 = _nc1 / 4;
@@ -1386,7 +1389,7 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
 }
 // C = A * B
-void VecWriteBasic(int n, float *c, float *C, int ldc) {
+void Gemmer::VecWriteBasic(int n, float *c, float *C, int ldc) {
   int nc1 = n / 16;
   int _nc1 = n % 16;
   int nc2 = _nc1 / 4;
@@ -1432,10 +1435,10 @@ void VecWriteBasic(int n, float *c, float *C, int ldc) {
 }
 // C = alpha * A * B + beta * C
-void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
+void Gemmer::VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
 // C = A * B + C
-void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
+void Gemmer::VecWriteWithAdd(int n, float *c, float *C, int ldc) {
   int nc1 = n / 16;
   int _nc1 = n % 16;
@@ -1473,7 +1476,7 @@ void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
 }
 // C = A * B + C, relu(C)
-void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
+void Gemmer::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
   int nc1 = n / 16;
   int _nc1 = n % 16;
@@ -1521,8 +1524,8 @@ void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
 }
 // C = A * B, batchnorm(C)
-void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
-                    float *bias) {
+void Gemmer::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
+                            float *bias) {
   int nc1 = n / 16;
   int _nc1 = n % 16;
   int nc2 = _nc1 / 4;
@@ -1588,8 +1591,8 @@ void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
 }
 // C = A * B, batchnorm(C), relu(C)
-void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
-                        float *bias) {
+void Gemmer::VecWriteWithBnRelu(int n, float *c, float *C, int ldc,
                                 float *scale, float *bias) {
   int nc1 = n / 16;
   int _nc1 = n % 16;
   int nc2 = _nc1 / 4;
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <vector>
 // Matrix element access macros, assuming row-major storage
 #define A(i, j) A[(i)*lda + (j)]
@@ -27,88 +28,111 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 namespace math {
+struct Gemmer {
+  int MC = 0;
+  int KC = 0;
+  int NC = 0;
+
+  float *packedA;
+  float *packedB;
+  float *packedC;
+  float *zero;
+
+  static std::vector<Gemmer *> gemmers;

   // Copy blocks of matrix A into contiguous memory (ColMajor)
   void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                    float *buffer);

   // Copy blocks of matrix B into contiguous memory (ColMajor)
   void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                    float *buffer);

   // Copy blocks of matrix A into contiguous memory (RowMajor)
   void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
                     float *buffer);

   // Copy blocks of matrix B into contiguous memory (RowMajor)
   void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
                     float *buffer);

   // Blocked matrix multiplication
   void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
                    float beta, float *c, float *C, int ldc, bool relu);

   void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
                          const float *b, float beta, float *c, float *C,
                          int ldc, bool relu, float *new_scale, float *new_bias);

   // Vector-matrix multiplication (M = 1)
   void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
                     const float *B, int ldb, float beta, float *C, int ldc,
                     bool relu);

   void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
                           int lda, const float *B, int ldb, float beta,
                           float *C, int ldc, bool relu, float *new_scale,
                           float *new_bias);

   // Compute a smaller block of the C matrix
   void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);

   void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);

   // Write back the result of the blocked matrix multiplication
   // C = A * B
   void WriteBasic(int mc, int nc, float *c, float *C, int ldc);

   // C = alpha * A * B + beta * C
   void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);

   // C = A * B + C
   void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);

   // C = A * B + C, relu(C)
   void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);

   // C = A * B, batchnorm(C)
   void WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
                    float *new_scale, float *new_bias);

   // C = A * B, batchnorm(C), relu(C)
   void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
                        float *new_scale, float *new_bias);

   // Write back the result of the vector-matrix multiplication
   // C = A * B
   void VecWriteBasic(int n, float *c, float *C, int ldc);

   // C = alpha * A * B + beta * C
   void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);

   // C = A * B + C
   void VecWriteWithAdd(int n, float *c, float *C, int ldc);

   // C = A * B + C, relu(C)
   void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);

   // C = A * B, batchnorm(C)
   void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
                       float *new_bias);

   // C = A * B, batchnorm(C), relu(C)
   void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
                           float *new_bias);

   // 32-bit float matrix multiplication
   void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
              const float *B, int ldb, float beta, float *C, int ldc, bool relu);

   // 32-bit float matrix multiplication, with batchnorm applied to the result
   void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
                    const float *B, int ldb, float beta, float *C, int ldc,
                    bool relu, float *new_scale, float *new_bias);

   // 64-bit double matrix multiplication
   void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
              const double *B, int ldb, float beta, double *C, int ldc);
+};
 }  // namespace math
 }  // namespace operators
......
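
For orientation, here is a minimal usage sketch of the refactored interface. It uses only the call pattern that already appears in this diff (alpha = 1, beta = 1, no fused relu, row-major buffers); the helper function and its name are illustrative and not part of the patch.

    #include "operators/math/gemm.h"

    using paddle_mobile::operators::math::Gemmer;

    // C (m x n) = A (m x k) * B (k x n); C must be zero-filled by the caller
    // so that the accumulate-into-C path yields a plain product.
    void SgemmExample(const float *A, const float *B, float *C, int m, int n,
                      int k) {
      if (Gemmer::gemmers.empty()) {
        Gemmer::gemmers.push_back(new Gemmer());  // what SetThreadNum() does
      }
      Gemmer::gemmers[0]->Sgemm(m, n, k, 1.0f, A, k, B, n, 1.0f, C, n, false);
    }
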
@@ -26,23 +26,14 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
-  //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
-  //  dim_out.size() ==
-  //  2,
-  //                 "The input and output of matmul be matrix");
-  //
-  //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
-  //                     platform::is_cpu_place(matrix_b.place())
-  //                     &&
-  //                     platform::is_cpu_place(matrix_out->place()),
-  //                 "Matrix must all be in CPUPlace");
   int M = dim_out[0];
   int N = dim_out[1];
   int K = (!trans_a) ? dim_a[1] : dim_a[0];
-  Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
-        beta, matrix_out->data<float>(), N, relu);
+  Gemmer::gemmers[0]->Sgemm(M, N, K, alpha, matrix_a.data<float>(), K,
+                            matrix_b.data<float>(), N, beta,
+                            matrix_out->data<float>(), N, relu);
 }
 template <>
@@ -54,24 +45,15 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
-  //  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 &&
-  //  dim_out.size() ==
-  //  2,
-  //                 "The input and output of matmul be matrix");
-  //
-  //  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
-  //                     platform::is_cpu_place(matrix_b.place())
-  //                     &&
-  //                     platform::is_cpu_place(matrix_out->place()),
-  //                 "Matrix must all be in CPUPlace");
   int M = dim_out[0];
   int N = dim_out[1];
   int K = (!trans_a) ? dim_a[1] : dim_a[0];
-  SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
-              N, beta, matrix_out->data<float>(), N, relu,
-              new_scale->data<float>(), new_bias->data<float>());
+  Gemmer::gemmers[0]->SgemmWithBn(
+      M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
+      beta, matrix_out->data<float>(), N, relu, new_scale->data<float>(),
+      new_bias->data<float>());
 }
 }  // namespace math
......
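
Routing matmul<float> and matmulWithBn<float> through Gemmer::gemmers[0] means the pool must be non-empty before any operator reaches these functions; in this patch that is the job of Executor::SetThreadNum, which the updated test below calls. A purely illustrative defensive guard, not part of the patch, could look like this:

    #include "operators/math/gemm.h"

    namespace paddle_mobile {
    namespace operators {
    namespace math {

    // Hypothetical helper: fall back to a lazily created instance so that a
    // missing SetThreadNum() call does not index an empty vector.
    inline Gemmer *DefaultGemmer() {
      if (Gemmer::gemmers.empty()) {
        Gemmer::gemmers.push_back(new Gemmer());
      }
      return Gemmer::gemmers[0];
    }

    }  // namespace math
    }  // namespace operators
    }  // namespace paddle_mobile
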
@@ -13,8 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef POOL_OP
-#include "pool_3x3.h"
+#define __ARM_NEON true
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 #include "framework/tensor.h"
+#include "pool_3x3.h"
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif  // __ARM_NEON
@@ -40,46 +44,52 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
   const int w_out = output->dims()[3];
   const int outputdata_channel_stride = h_out * w_out;
   const int inputdata_channel_stride = h_in * w_in;
+  const int input_batch_stride = output_channels * inputdata_channel_stride;
+  const int output_batch_stride = output_channels * outputdata_channel_stride;
   float *out_data = output->data<float>();
   const float *input_data = input->data<float>();
   const float coef = 1.0 / 9.0;
   for (int k = 0; k < batch_size; ++k) {
+#pragma omp parallel for
     for (int c = 0; c < output_channels; ++c) {
+      const float *input_seg = input_data + c * inputdata_channel_stride;
+      float *output_seg = out_data + c * outputdata_channel_stride;
       // four corner point
-      out_data[0] = (input_data[0] + input_data[1] + input_data[w_in] +
-                     input_data[w_in + 1]) *
-                    coef;
-      out_data[w_out - 1] =
-          (input_data[w_in - 2] + input_data[w_in - 1] +
-           input_data[w_in * 2 - 2] + input_data[2 * w_in - 1]) *
-          coef;
-      out_data[(h_out - 1) * w_out] =
-          (input_data[(h_in - 2) * w_in] + input_data[(h_in - 2) * w_in + 1] +
-           input_data[(h_in - 1) * w_in] + input_data[(h_in - 1) * w_in + 1]) *
-          coef;
-      out_data[h_out * w_out - 1] =
-          (input_data[h_in * w_in - 1] + input_data[h_in * w_in - 2] +
-           input_data[(h_in - 1) * w_in - 1] +
-           input_data[(h_in - 1) * w_in - 2]) *
-          coef;
+      output_seg[0] = (input_seg[0] + input_seg[1] + input_seg[w_in] +
+                       input_seg[w_in + 1]) *
+                      coef;
+      output_seg[w_out - 1] =
+          (input_seg[w_in - 2] + input_seg[w_in - 1] + input_seg[w_in * 2 - 2] +
+           input_seg[2 * w_in - 1]) *
+          coef;
+      output_seg[(h_out - 1) * w_out] =
+          (input_seg[(h_in - 2) * w_in] + input_seg[(h_in - 2) * w_in + 1] +
+           input_seg[(h_in - 1) * w_in] + input_seg[(h_in - 1) * w_in + 1]) *
+          coef;
+      output_seg[h_out * w_out - 1] =
+          (input_seg[h_in * w_in - 1] + input_seg[h_in * w_in - 2] +
+           input_seg[(h_in - 1) * w_in - 1] +
+           input_seg[(h_in - 1) * w_in - 2]) *
+          coef;
       // left side & right side
       for (int i = 1; i < h_in - 1; ++i) {
-        out_data[i * w_out] =
-            (input_data[i * w_in - w_in] + input_data[i * w_in - w_in + 1] +
-             input_data[i * w_in] + input_data[i * w_in + 1] +
-             input_data[i * w_in + w_in] + input_data[i * w_in + w_in + 1]) *
-            coef;
-        out_data[i * w_out + w_out - 1] =
-            (input_data[i * w_in - w_in + w_in - 2] +
-             input_data[i * w_in - w_in + 1 + w_in - 2] +
-             input_data[i * w_in + w_in - 2] +
-             input_data[i * w_in + 1 + w_in - 2] +
-             input_data[i * w_in + w_in + w_in - 2] +
-             input_data[i * w_in + w_in + 1 + w_in - 2]) *
-            coef;
+        output_seg[i * w_out] =
+            (input_seg[i * w_in - w_in] + input_seg[i * w_in - w_in + 1] +
+             input_seg[i * w_in] + input_seg[i * w_in + 1] +
+             input_seg[i * w_in + w_in] + input_seg[i * w_in + w_in + 1]) *
+            coef;
+        output_seg[i * w_out + w_out - 1] =
+            (input_seg[i * w_in - w_in + w_in - 2] +
+             input_seg[i * w_in - w_in + 1 + w_in - 2] +
+             input_seg[i * w_in + w_in - 2] +
+             input_seg[i * w_in + 1 + w_in - 2] +
+             input_seg[i * w_in + w_in + w_in - 2] +
+             input_seg[i * w_in + w_in + 1 + w_in - 2]) *
+            coef;
       }
       // top 1 row & bottom 1 row
-      const float *input_tmp = input_data;
+      const float *input_tmp = input_seg;
       float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
           tmp3, tmp4, tmp5, sum, out0;
@@ -90,7 +100,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
       in4 = vld1q_f32(input_tmp_end);
       in6 = vld1q_f32(input_tmp_end + w_in);
       int c_mid = w_out - 2;
-      auto output_ptr = out_data + 1;
+      auto output_ptr = output_seg + 1;
       for (; c_mid > 3; c_mid -= 4) {
         in1 = vld1q_f32(input_tmp + 4);
         in3 = vld1q_f32(input_tmp + w_in + 4);
@@ -135,8 +145,8 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
         in6 = in7;
       }
       // top right remain
-      float32x4_t pad0 = vdupq_n_f32(input_data[w_in - 1]);
-      float32x4_t pad1 = vdupq_n_f32(input_data[2 * w_in - 1]);
+      float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]);
+      float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]);
       tmp0 = vextq_f32(in0, pad0, 1);
       tmp1 = vextq_f32(in0, pad0, 2);
@@ -163,8 +173,8 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
       }
       // bottom_right remain
-      float32x4_t pad2 = vdupq_n_f32(input_data[(h_in - 1) * w_in - 1]);
-      float32x4_t pad3 = vdupq_n_f32(input_data[h_in * w_in - 1]);
+      float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]);
+      float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]);
       tmp0 = vextq_f32(in4, pad2, 1);
       tmp1 = vextq_f32(in4, pad2, 2);
@@ -191,8 +201,8 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
       }
       // mid
       for (int j = 0; j < h_out - 2; ++j) {
-        output_ptr = out_data + w_out * (j + 1) + 1;
-        input_tmp = input_data + j * w_in;
+        output_ptr = output_seg + w_out * (j + 1) + 1;
+        input_tmp = input_seg + j * w_in;
         in0 = vld1q_f32(input_tmp);
         in2 = vld1q_f32(input_tmp + w_in);
@@ -228,9 +238,9 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
           in4 = in5;
         }
         // mid remain
-        float32x4_t pad0 = vdupq_n_f32(input_data[(j + 1) * w_in - 1]);
-        float32x4_t pad1 = vdupq_n_f32(input_data[(j + 2) * w_in - 1]);
-        float32x4_t pad2 = vdupq_n_f32(input_data[(j + 2) * w_in - 1]);
+        float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]);
+        float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
+        float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
         tmp0 = vextq_f32(in0, pad0, 1);
         tmp1 = vextq_f32(in0, pad0, 2);
@@ -261,9 +271,11 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
         }
       }
     }
-    input_data += inputdata_channel_stride;
-    out_data += outputdata_channel_stride;
+    // input_data += inputdata_channel_stride;
+    // out_data += outputdata_channel_stride;
     }
+    input_data += input_batch_stride;
+    out_data += output_batch_stride;
   }
 #endif
 }
@@ -282,44 +294,50 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
   const int w_out = output->dims()[3];
   const int outputdata_channel_stride = h_out * w_out;
   const int inputdata_channel_stride = h_in * w_in;
+  const int input_batch_stride = output_channels * inputdata_channel_stride;
+  const int output_batch_stride = output_channels * outputdata_channel_stride;
   float *out_data = output->data<float>();
   const float *input_data = input->data<float>();
   for (int k = 0; k < batch_size; ++k) {
+#pragma omp parallel for
     for (int c = 0; c < output_channels; ++c) {
+      const float *input_seg = input_data + c * inputdata_channel_stride;
+      float *output_seg = out_data + c * outputdata_channel_stride;
       // four corner point
-      out_data[0] = std::max(std::max(input_data[0], input_data[1]),
-                             std::max(input_data[w_in], input_data[w_in + 1]));
-      out_data[w_out - 1] = std::max(
-          std::max(input_data[w_in - 2], input_data[w_in - 1]),
-          std::max(input_data[w_in * 2 - 2], input_data[2 * w_in - 1]));
-      out_data[(h_out - 1) * w_out] =
-          std::max(std::max(input_data[(h_in - 2) * w_in],
-                            input_data[(h_in - 2) * w_in + 1]),
-                   std::max(input_data[(h_in - 1) * w_in],
-                            input_data[(h_in - 1) * w_in + 1]));
-      out_data[h_out * w_out - 1] = std::max(
-          std::max(input_data[(h_in - 1) * w_in - 1],
-                   input_data[(h_in - 1) * w_in - 2]),
-          std::max(input_data[h_in * w_in - 1], input_data[h_in * w_in - 2]));
+      output_seg[0] = std::max(std::max(input_seg[0], input_seg[1]),
+                               std::max(input_seg[w_in], input_seg[w_in + 1]));
+      output_seg[w_out - 1] =
+          std::max(std::max(input_seg[w_in - 2], input_seg[w_in - 1]),
+                   std::max(input_seg[w_in * 2 - 2], input_seg[2 * w_in - 1]));
+      output_seg[(h_out - 1) * w_out] =
+          std::max(std::max(input_seg[(h_in - 2) * w_in],
+                            input_seg[(h_in - 2) * w_in + 1]),
+                   std::max(input_seg[(h_in - 1) * w_in],
+                            input_seg[(h_in - 1) * w_in + 1]));
+      output_seg[h_out * w_out - 1] = std::max(
+          std::max(input_seg[(h_in - 1) * w_in - 1],
+                   input_seg[(h_in - 1) * w_in - 2]),
+          std::max(input_seg[h_in * w_in - 1], input_seg[h_in * w_in - 2]));
       // left side & right side
       for (int i = 1; i < h_in - 1; ++i) {
-        float max1 = std::max(input_data[i * w_in - w_in],
-                              input_data[i * w_in - w_in + 1]);
-        float max2 = std::max(input_data[i * w_in], input_data[i * w_in + 1]);
-        float max3 = std::max(input_data[i * w_in + w_in],
-                              input_data[i * w_in + w_in + 1]);
-        out_data[i * w_out] = std::max(std::max(max1, max2), max3);
-        max1 = std::max(input_data[i * w_in - w_in + w_in - 2],
-                        input_data[i * w_in - w_in + 1 + w_in - 2]);
-        max2 = std::max(input_data[i * w_in + w_in - 2],
-                        input_data[i * w_in + 1 + w_in - 2]);
-        max3 = std::max(input_data[i * w_in + w_in + w_in - 2],
-                        input_data[i * w_in + w_in + 1 + w_in - 2]);
-        out_data[i * w_out + w_out - 1] = std::max(std::max(max1, max2), max3);
+        float max1 = std::max(input_seg[i * w_in - w_in],
+                              input_seg[i * w_in - w_in + 1]);
+        float max2 = std::max(input_seg[i * w_in], input_seg[i * w_in + 1]);
+        float max3 = std::max(input_seg[i * w_in + w_in],
+                              input_seg[i * w_in + w_in + 1]);
+        output_seg[i * w_out] = std::max(std::max(max1, max2), max3);
+        max1 = std::max(input_seg[i * w_in - w_in + w_in - 2],
+                        input_seg[i * w_in - w_in + 1 + w_in - 2]);
+        max2 = std::max(input_seg[i * w_in + w_in - 2],
+                        input_seg[i * w_in + 1 + w_in - 2]);
+        max3 = std::max(input_seg[i * w_in + w_in + w_in - 2],
+                        input_seg[i * w_in + w_in + 1 + w_in - 2]);
+        output_seg[i * w_out + w_out - 1] =
+            std::max(std::max(max1, max2), max3);
       }
       // top 1 row & bottom 1 row
-      const float *input_tmp = input_data;
+      const float *input_tmp = input_seg;
       float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
           tmp3, tmp4, tmp5, max;
@@ -329,7 +347,7 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
      in4 = vld1q_f32(input_tmp_end);
      in6 = vld1q_f32(input_tmp_end + w_in);
      int c_mid = w_out - 2;
-      auto output_ptr = out_data + 1;
+      auto output_ptr = output_seg + 1;
      for (; c_mid > 3; c_mid -= 4) {
        in1 = vld1q_f32(input_tmp + 4);
        in3 = vld1q_f32(input_tmp + w_in + 4);
@@ -373,8 +391,8 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
         in6 = in7;
       }
       // top right remain
-      float32x4_t pad0 = vdupq_n_f32(input_data[w_in - 1]);
-      float32x4_t pad1 = vdupq_n_f32(input_data[2 * w_in - 1]);
+      float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]);
+      float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]);
       tmp0 = vextq_f32(in0, pad0, 1);
       tmp1 = vextq_f32(in0, pad0, 2);
@@ -400,8 +418,8 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
       }
       // bottom_right remain
-      float32x4_t pad2 = vdupq_n_f32(input_data[(h_in - 1) * w_in - 1]);
-      float32x4_t pad3 = vdupq_n_f32(input_data[h_in * w_in - 1]);
+      float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]);
+      float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]);
       tmp0 = vextq_f32(in4, pad2, 1);
       tmp1 = vextq_f32(in4, pad2, 2);
@@ -427,8 +445,8 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
       }
       // mid
       for (int j = 0; j < h_out - 2; ++j) {
-        output_ptr = out_data + (j + 1) * w_out + 1;
-        input_tmp = input_data + j * w_in;
+        output_ptr = output_seg + (j + 1) * w_out + 1;
+        input_tmp = input_seg + j * w_in;
         in0 = vld1q_f32(input_tmp);
         in2 = vld1q_f32(input_tmp + w_in);
@@ -463,9 +481,9 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
           in4 = in5;
         }
         // mid remain
-        float32x4_t pad0 = vdupq_n_f32(input_data[(j + 1) * w_in - 1]);
-        float32x4_t pad1 = vdupq_n_f32(input_data[(j + 2) * w_in - 1]);
-        float32x4_t pad2 = vdupq_n_f32(input_data[(j + 3) * w_in - 1]);
+        float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]);
+        float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
+        float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 3) * w_in - 1]);
         tmp0 = vextq_f32(in0, pad0, 1);
         tmp1 = vextq_f32(in0, pad0, 2);
@@ -495,9 +513,11 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
         }
       }
     }
-    input_data += inputdata_channel_stride;
-    out_data += outputdata_channel_stride;
+    // input_data += inputdata_channel_stride;
+    // out_data += outputdata_channel_stride;
     }
+    input_data += input_batch_stride;
+    out_data += output_batch_stride;
   }
 #endif
 }
@@ -515,11 +535,11 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
   const int output_height = output->dims()[2];
   const int output_width = output->dims()[3];
-  const int _kernel_size = 3;
-  const int stride_height = strides[0];
-  const int stride_width = strides[1];
-  const int padding_height = paddings[0];
-  const int padding_width = paddings[1];
+  // const int _kernel_size = 3;
+  const int stride = strides[0];
+  // const int stride_width = strides[1];
+  const int padding = paddings[0];
+  // const int padding_width = paddings[1];
   const float negative_max = -INT_MAX;
   const int input_channel_stride = input_height * input_width;
   const int output_channel_stride = output_height * output_width;
@@ -529,36 +549,39 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
   const int input_batch_stride = output_channels * input_channel_stride;
   const int output_batch_stride = output_channels * output_channel_stride;
-  const float *pos1, *pos2, *pos3, *output_ptr;
+  const float *pos1, *output_ptr;
   int hstart, wstart, hend, wend;
   for (int i = 0; i < batch_size; ++i) {
+#pragma omp parallel for
     for (int c = 0; c < output_channels; ++c) {
+      const float *input_seg = input_data + c * input_channel_stride;
+      float *output_seg = output_data + c * output_channel_stride;
       for (int ph = 0; ph < output_height; ph++) {
         for (int pw = 0; pw < output_width; pw++) {
-          hstart = ph * stride_height - padding_height;
-          wstart = pw * stride_width - padding_width;
-          hend = min(hstart + _kernel_size, input_height + padding_height);
-          wend = min(wstart + _kernel_size, input_width + padding_width);
+          int hstart = ph * stride - padding;
+          int wstart = pw * stride - padding;
+          int hend = min(hstart + 3, input_height + padding);
+          int wend = min(wstart + 3, input_width + padding);
           hstart = max(hstart, 0);
           wstart = max(wstart, 0);
           hend = min(hend, input_height);
           wend = min(wend, input_width);
-          pos1 = input_data + hstart * input_width + wstart;
-          pos2 = input_data + (hstart + 1) * input_width + wstart;
-          pos3 = input_data + (hstart + 2) * input_width + wstart;
-          output_ptr = output_data + ph * output_width + pw;
+          const float *pos1 = input_seg + hstart * input_width + wstart;
+          const float *pos2 = input_seg + (hstart + 1) * input_width + wstart;
+          const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;
+          output_ptr = output_seg + ph * output_width + pw;
           if (hend - hstart != 3 || wend - wstart != 3) {
             float max_value = -INT_MAX;
             for (int h = hstart; h < hend; h++) {
               for (int w = wstart; w < wend; w++) {
-                float value = input_data[h * input_width + w];
+                float value = input_seg[h * input_width + w];
                 if (value > max_value) {
                   max_value = value;
                 }
               }
             }
-            output_data[ph * output_width + pw] = max_value;
+            output_seg[ph * output_width + pw] = max_value;
           } else {
 #if defined(ARMV7)
             asm volatile(
@@ -572,27 +595,25 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
                 "vpmax.f32 d7, d6, d6 \n\t"
                 "vst1.32 {d7[0]},[%[output_ptr]] \n\t"
                 :
-                : [input_data] "r"(input_data), [pos1] "r"(pos1),
+                : [input_seg] "r"(input_seg), [pos1] "r"(pos1),
                   [pos2] "r"(pos2), [pos3] "r"(pos3),
                   [output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
                 : "memory", "q1", "q2", "q3", "q4");
 #else
             const float32x4_t data1 = vld1q_f32(pos1);
-            const float32x4_t data2 = vld1q_f32(pos2);
-            const float32x4_t data3 = vld1q_f32(pos3);
+            const float32x4_t data2 = vld1q_f32(pos1 + input_width);
+            const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width);
             const float32x4_t max_data =
-                vmaxq_f32(vmaxq_f32(data1, data3), data2);
+                vmaxq_f32(vmaxq_f32(data1, data2), data3);
             float32x2_t res =
                 vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
                           vget_low_f32(max_data));
             res = vpmax_f32(res, res);
-            output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
+            output_seg[ph * output_width + pw] = vget_lane_f32(res, 0);
 #endif
           }
         }
       }
-      input_data += input_channel_stride;
-      output_data += output_channel_stride;
     }
     input_data += input_batch_stride;
     output_data += output_batch_stride;
@@ -613,11 +634,8 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
   const int output_height = output->dims()[2];
   const int output_width = output->dims()[3];
-  const int _kernel_size = 3;
-  const int stride_height = strides[0];
-  const int stride_width = strides[1];
-  const int padding_height = paddings[0];
-  const int padding_width = paddings[1];
+  const int stride = strides[0];
+  const int padding = paddings[0];
   const int input_channel_stride = input_height * input_width;
   const int output_channel_stride = output_height * output_width;
@@ -631,30 +649,33 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
   const int input_batch_stride = output_channels * input_channel_stride;
   const int output_batch_stride = output_channels * output_channel_stride;
   for (int i = 0; i < batch_size; ++i) {
+#pragma omp parallel for
     for (int c = 0; c < output_channels; ++c) {
+      const float *input_seg = input_data + c * input_channel_stride;
+      float *output_seg = output_data + c * output_channel_stride;
       for (int ph = 0; ph < output_height; ph++) {
         for (int pw = 0; pw < output_width; pw++) {
-          int hstart = ph * stride_height - padding_height;
-          int wstart = pw * stride_width - padding_width;
-          int hend = min(hstart + _kernel_size, input_height + padding_height);
-          int wend = min(wstart + _kernel_size, input_width + padding_width);
+          int hstart = ph * stride - padding;
+          int wstart = pw * stride - padding;
+          int hend = min(hstart + 3, input_height + padding);
+          int wend = min(wstart + 3, input_width + padding);
           hstart = max(hstart, 0);
           wstart = max(wstart, 0);
           hend = min(hend, input_height);
           wend = min(wend, input_width);
-          const float *pos1 = input_data + hstart * input_width + wstart;
-          const float *pos2 = input_data + (hstart + 1) * input_width + wstart;
-          const float *pos3 = input_data + (hstart + 2) * input_width + wstart;
-          const float *output_ptr = output_data + ph * output_width + pw;
+          const float *pos1 = input_seg + hstart * input_width + wstart;
+          const float *pos2 = input_seg + (hstart + 1) * input_width + wstart;
+          const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;
+          float *output_ptr = output_seg + ph * output_width + pw;
           if (hend - hstart != 3 || wend - wstart != 3) {
             float sum = 0;
             for (int h = hstart; h < hend; h++) {
               for (int w = wstart; w < wend; w++) {
-                sum += input_data[h * input_width + w];
+                sum += input_seg[h * input_width + w];
               }
             }
-            output_data[ph * output_width + pw] = sum / 9.0;
+            output_seg[ph * output_width + pw] = sum / 9.0;
           } else {
 #if defined(ARMV7)
@@ -671,7 +692,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
                 "vmul.f32 d6,d7 \n\t"
                 "vst1.32 {d6[0]},[%[output_ptr]] \n\t"
                 :
-                : [input_data] "r"(input_data), [pos1] "r"(pos1),
+                : [input_seg] "r"(input_seg), [pos1] "r"(pos1),
                   [pos2] "r"(pos2), [pos3] "r"(pos3),
                   [output_ptr] "r"(output_ptr), [zero] "r"(zero),
                   [nine_ptr] "r"(nine_ptr)
@@ -686,13 +707,11 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
                 vpadd_f32(vget_high_f32(vsetq_lane_f32(0, sum_data, 3)),
                           vget_low_f32(sum_data));
             res = vpadd_f32(res, res);
-            output_data[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
+            output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
 #endif
           }
         }
       }
-      input_data += input_channel_stride;
-      output_data += output_channel_stride;
     }
     input_data += input_batch_stride;
     output_data += output_batch_stride;
......
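
The pooling changes above follow one pattern: the old code advanced input_data and out_data by a channel stride at the end of every channel iteration, which makes each iteration depend on the previous one and cannot sit under #pragma omp parallel for; the new code derives per-channel pointers from the loop index and applies the batch stride once per batch. A generic sketch of that transformation (illustrative only, not the pooling kernel itself):

    // Before: bumping shared pointers inside the loop serializes it.
    //   for (int c = 0; c < channels; ++c) { ...; input += in_stride; output += out_stride; }
    // After: each iteration computes its own offsets, so iterations are independent.
    void PerChannel(const float *input, float *output, int channels,
                    int in_stride, int out_stride) {
    #pragma omp parallel for
      for (int c = 0; c < channels; ++c) {
        const float *in_seg = input + c * in_stride;   // independent of other c
        float *out_seg = output + c * out_stride;
        out_seg[0] = in_seg[0];  // per-channel work would go here
      }
    }
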
@@ -15,6 +15,9 @@ limitations under the License. */
 #ifdef POOL_OP
 #pragma once
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 #include <algorithm>
 #include <vector>
 #include "framework/tensor.h"
......
@@ -16,6 +16,9 @@ limitations under the License. */
 #include "pooling.h"
 #include "common/types.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 namespace paddle_mobile {
 namespace operators {
@@ -57,8 +60,8 @@ class PoolFunctor<CPU, PoolProcess, T> {
     T *output_data = output->mutable_data<T>();
     for (int i = 0; i < batch_size; i++) {
-      // #pragma omp parallel for
       for (int c = 0; c < output_channels; ++c) {
+#pragma omp parallel for
         for (int ph = 0; ph < output_height; ++ph) {
           int hstart = ph * stride_height - padding_height;
           int hend = std::min(hstart + ksize_height, input_height);
......
@@ -26,16 +26,17 @@ int main() {
   auto time2 = time();
   DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
   paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
+  executor.SetThreadNum(4);
   std::vector<float> input;
   std::vector<int64_t> dims{1, 3, 224, 224};
   GetInput<float>(g_test_image_1x3x224x224, &input, dims);
   auto time3 = time();
-  for (int i = 0; i < 10; ++i) {
+  int count = 1;
+  for (int i = 0; i < count; ++i) {
     executor.Predict(input, dims);
   }
   auto time4 = time();
-  DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n";
+  DLOG << "predict cost :" << time_diff(time3, time4) / count << "ms\n";
   return 0;
 }